/**
 *-----------------------------------------------------------------------------------
 *    Filename: ADFFixedMath.c
 *-----------------------------------------------------------------------------------
 *-----------------------------------------------------------------------------------
 *    Copyright 2004-2007 Mitsubishi Electric Research Laboratories (MERL)
 *    An implementation of fixed point math functions
 *    Eric Chan, Ronald Perry, and Sarah Frisken
 *-----------------------------------------------------------------------------------
 */

/**
 *-----------------------------------------------------------------------------------
 *    START: iType ADF Rendering
 *-----------------------------------------------------------------------------------
 */
#include "fs_itype.h"

#if defined(FS_EDGE_HINTS) || defined(FS_EDGE_RENDER)

/**
 *-----------------------------------------------------------------------------------
 *    Required include files for this implementation
 *-----------------------------------------------------------------------------------
 */
#include "adftypesystem.h"
#include "adffixedmath.h"


/**
 *-----------------------------------------------------------------------------------
 *    START: FIXED POINT MATH ONLY
 *-----------------------------------------------------------------------------------
 */

/**
 *-----------------------------------------------------------------------------------
 *    TEXTBOOK REFERENCE
 *-----------------------------------------------------------------------------------
 *-----------------------------------------------------------------------------------
 *    The documentation in this file often refers to the textbook "ARM System
 *    Developer's Guide: Designing and Optimizing System Software" by Andrew N. Sloss,
 *    Dominic Symes, and Chris Wright. The textbook has ISBN 1-55860-874-5 and is
 *    published by Morgan Kaufmann Publishers.
 *-----------------------------------------------------------------------------------
 */

#ifdef FS_EDGE_RENDER

/**
 *-----------------------------------------------------------------------------------
 *    NEWTON-RAPHSON METHOD OVERVIEW
 *-----------------------------------------------------------------------------------
 *    Newton-Raphson iteration is a method for solving equations numerically. Given a
 *    good initial approximation of the solution to an equation, Newton-Raphson
 *    iteration converges rapidly on that solution. Convergence is usually quadratic
 *    with the number of valid bits in the result roughly doubling with each iteration.
 *
 *    The Newton-Raphson iteration method applies to any equation of the form f(x) = 0,
 *    where f(x) is a differentiable function with derivative f'(x). The method begins
 *    with an approximation x{i} to a solution x of the equation. Then the following
 *    iterative equation is applied to obtain a better approximation x{i+1}:
 *
 *      x{i+1} = x{i} - (f(x{i}) / f'(x{i}))
 *
 *    For example, consider using the Newton-Raphson iteration method to solve the
 *    equation f(x) = 0.64 - x^2 = 0. The derivative of f(x) is f'(x) = -2*x.
 *    Therefore, the Newton-Raphson iteration equation to solve f(x) is:
 *
 *      x{i+1} = x{i} - ((0.64 - x{i}^2) / (-2 * x{i}))
 *             = (0.32 / x{i}) + (0.5 * x{i})
 *
 *    Let x{0} = 1 be the initial approximation to the solution of f(x) = 0. Applying
 *    the above formula once produces a better approximation x{1}:
 *
 *      x{1}     = (0.32 / x{0}) + (0.5 * x{0})
 *             = (0.32 / 1)     + (0.5 * 1)
 *             = 0.82
 *
 *    Applying the above formula again produces an even better approximation x{2}:
 *
 *      x{2}     = (0.32 / x{1}) + (0.5 * x{1})
 *             = (0.32 / 0.82) + (0.5 * 0.82)
 *            ~= 0.8002439
 *
 *    With repeated applications of the above formula, the Newton-Raphson iteration
 *    converges to the true solution: 0.8.
 *
 *    The Newton-Raphson iteration method is used by this fixed point math
 *    implementation to compute reciprocal square roots. The reciprocal square root of
 *    n is rsqrt(n) = 1 / sqrt(n), which is equivalent to n^(-0.5). Computing the
 *    reciprocal square root can be accomplished by solving the equation f(x) = n -
 *    x^(-2) = 0, which has a positive solution x = n^(-0.5). The derivative of f(x) is
 *    f'(x) = 2 * x^(-3). Therefore, the Newton-Raphson iteration equation to solve
 *    f(x) = 0 is:
 *
 *      x{i+1} = x{i} - ((n - x{i}^(-2)) / (2 * x{i}^(-3)))
 *             = 0.5 * x{i} * (3 - n * x{i}^2)
 *
 *    For example, consider using the Newton-Raphson iteration method to compute the
 *    reciprocal square root of 9. The Newton-Raphson iteration equation is:
 *
 *      x{i+1} = 0.5 * x{i} * (3 - n * x{i}^2)
 *             = 0.5 * x{i} * (3 - 9 * x{i}^2)
 *             = 1.5 * x{i} * (1 - 3 * x{i}^2)
 *
 *    Let x{0} = 0.3 be the initial approximation to the solution of f(x) = 0. Applying
 *    the above formula once produces a better approximation x{1}:
 *
 *      x{1}     = 1.5 * x{0} * (1 - 3 * x{0}^2)
 *             = 1.5 * 0.3  * (1 - 3 * 0.3^2)
 *             = 0.3285
 *
 *    Applying the above formula again produces an even better approximation x{2}:
 *
 *      x{2}     = 1.5 * x{1}    * (1 - 3 * x{1}^2)
 *             = 1.5 * 0.3285 * (1 - 3 * 0.3285^2)
 *            ~= 0.3332287
 *
 *    With repeated applications of the above formula, the Newton-Raphson iteration
 *    converges to the true solution: 1/3.
 *
 *    The Newton-Raphson iteration method is also used by this fixed point math
 *    implementation to compute reciprocals. The reciprocal of n is 1 / n, which is
 *    equivalent to n^(-1). Computing the reciprocal can be accomplished by solving the
 *    equation f(x) = n - x^(-1) = 0, which has a solution x = n^(-1). The derivative
 *    of f(x) is f'(x) = x^(-2). Therefore, the Newton-Raphson iteration equation to
 *    solve f(x) = 0 is:
 *
 *      x{i+1} = x{i} - ((n - x{i}^(-1)) / (x{i}^(-2)))
 *             = x{i} * (2 - n * x{i})
 *
 *    For example, consider using the Newton-Raphson iteration method to compute the
 *    reciprocal of 0.8. The Newton-Raphson iteration equation is:
 *
 *      x{i+1} = x{i} * (2 - n * x{i})
 *             = x{i} * (2 - 0.8 * x{i})
 *
 *    Let x{0} = 1 be the initial approximation to the solution of f(x) = 0. Applying
 *    the above formula once produces a better approximation x{1}:
 *
 *      x{1}     = x{0} * (2 - 0.8 * x{0})
 *             = 1    * (2 - 0.8 * 1)
 *             = 1.2
 *
 *    Applying the above formula again produces an even better approximation x{2}:
 *
 *      x{2}     = x{1} * (2 - 0.8 * x{1})
 *             = 1.2    * (2 - 0.8 * 1.2)
 *             = 1.248
 *
 *    With repeated applications of the above formula, the Newton-Raphson iteration
 *    converges to the true solution: 1.25.
 *
 *    When using the Newton-Raphson iteration method to solve an equation f(x) = 0, it
 *    is important to start with a good initial approximation to the solution. With a
 *    poor initial approximation, the Newton-Raphson iteration may converge very slowly
 *    or, even worse, may not converge at all. The reciprocal square root and
 *    reciprocal implementations in this file obtain good initial approximations by
 *    indexing into small lookup tables based on the leading bits of the input value.
 *    The table lookups are followed by two iterations of Newton-Raphson. Proofs of the
 *    correctness of this approach are given in Chapter 7 of ARM System Developer's
 *    Guide.
 *
 *    To learn more about the Newton-Raphson iteration method and efficient fixed point
 *    implementations, refer to Chapter 7 of ARM System Developer's Guide.
 *-----------------------------------------------------------------------------------
 */


/**
 *-----------------------------------------------------------------------------------
 *    RECIPROCAL SQUARE ROOT FUNCTIONS
 *-----------------------------------------------------------------------------------
 *-----------------------------------------------------------------------------------
 *    Compute and return the reciprocal square root of n (i.e., 1 / sqrt(n)), where the
 *    input n is a 0.32 fixed point value that lies in the mathematical range [0.25,
 *    1). Considered as an unsigned integer, n must satisfy n >= 2^30. The computed
 *    result is a 2.30 fixed point value whose leading 1 bit is at bit position 30. The
 *    computed result is rounded towards negative infinity.
 *
 *    If the input n is less than 2^30 (i.e., its mathematical value is less than
 *    0.25), the result is undefined.
 *
 *    All assembly and C implementations produce bit-identical results.
 *
 *    Implementation notes:
 *
 *      - The reciprocal square root estimation method consists of a table lookup
 *        followed by two iterations of Newton-Raphson. The Newton-Raphson iteration
 *        equation to compute a reciprocal square root is x{i+1} = 0.5 * x{i} * (3 - n
 *        * x{i}^2), where x{i} is the estimate of the solution in the current
 *        iteration (i.e., iteration i) and x{i+1} is the computed estimate in the next
 *        iteration (i.e., iteration (i + 1)). See the above section Newton-Raphson
 *        Method Overview for an overview of the theory behind this numerical technique
 *        and for the derivation of the above formula.
 *
 *      - A table lookup on the leading bits of the input n is used to obtain an
 *        initial approximation to the reciprocal square root of n. See the
 *        documentation for rsqTable[] below for more details about the table.
 *
 *      - The documentation for the C implementation below uses a running example to
 *        help explain the fixed point implementation. This running example considers
 *        the case where the input value n is the 0.32 fixed point value 0x80000000,
 *        which has the mathematical value (0x80000000 / 2^32) = 0.5.
 *
 *    Performance notes:
 *
 *      Intel Centrino Core Duo T2500 (2 MB L2, 2.0 GHz, FSB 667 MHz): MSVC 6 compiler,
 *      Release mode:
 *        - ADF_MATH_FIXED_C_64     is ~1.7x as fast as ADF_MATH_FIXED_C_32
 *        - ADF_MATH_FIXED_ASM_X86 is ~1.4x as fast as ADF_MATH_FIXED_C_64
 *        - ADF_MATH_FIXED_ASM_X86 is ~2.4x as fast as ADF_MATH_FIXED_C_32
 *-----------------------------------------------------------------------------------
 *-----------------------------------------------------------------------------------
 *    rsqTable[] is the reciprocal square root lookup table used by the RSQ() function.
 *    The table contains 96 elements and is precomputed as follows:
 *
 *    rsqTable[i] = round(256.0 / sqrt((i + 32.3) / 128.0)) - 256, where i is an
 *    integer that lies in the range [0, 95]. The following discussion explains this
 *    formula.
 *
 *    The purpose of rsqTable[] is to provide a fast and reasonably accurate initial
 *    estimate to the reciprocal square root of a number n.
 *
 *    Assume that n is a 0.32 fixed point value that lies in the mathematical range
 *    (0.25, 1). Considered as an unsigned integer, n is greater than 2^30. Therefore,
 *    either bit 31 or bit 30 of n is 1.
 *
 *    The reciprocal square root of each value in the range (0.25, 1) can be expressed
 *    as a 24.8 fixed point value, where the integer portion of the 24.8 fixed point
 *    value is always 1. For example, the reciprocal square root of 0.5 can be
 *    expressed as the 24.8 fixed point value 0x16a, which has the mathematical value
 *    1.4140625. Consequently, the table only needs to store the low 8 bits (i.e., the
 *    fractional bits) of the reciprocal square root of each value in the range (0.25,
 *    1). Storing the integer portion of the 24.8 fixed point value in the table is
 *    unnecessary because it is always 1.
 *
 *    For a good tradeoff between table size and the accuracy of the initial estimate,
 *    this implementation uses the leading seven fractional bits of n to index into the
 *    table, thereby limiting the table size to 128 elements.
 *
 *    Observe that the index into the table is the 7-bit unsigned integer obtained from
 *    the leading seven fractional bits of n. Since either bit 6 (i.e., the MSB) or bit
 *    5 of the 7-bit unsigned integer index is 1, the value of the index must be at
 *    least 32. Therefore, the index lies in the range [32, 127]. As a result, the
 *    table only needs to contain 127 - 32 + 1 = 96 elements and can be indexed using
 *    integers that lie in the range [0, 95].
 *
 *    Computing the table is accomplished with the following steps:
 *
 *    First, map an integer index i in the range [0, 95] to the range [0.25234375,
 *    0.99453125], which is an approximation to the range (0.25, 1):
 *
 *      (i + 32.3) / 128.0
 *
 *    The value of 32.3 (instead of 32.0) is chosen for technical reasons explained
 *    below. Next, compute the reciprocal square root:
 *
 *      1 / sqrt((i + 32.3) / 128.0)
 *
 *    Scale the result by 256.0 and round to the nearest integer to obtain a 24.8 fixed
 *    point value:
 *
 *      round(256.0 / sqrt((i + 32.3) / 128.0))
 *
 *    Finally, subtract (i.e., remove) the integer portion of the 24.8 fixed point
 *    value because the integer portion is always 1. Putting all the steps together
 *    yields the following formula:
 *
 *      rsqTable[i] = round(256.0 / sqrt((i + 32.3) / 128.0)) - 256
 *
 *    The reason for choosing 32.3 instead of 32.0 is to handle the case where the
 *    index i is zero. Suppose the value of 32.0 is used instead. Then
 *
 *      rsqTable[0] = round(256.0 / sqrt((0 + 32.0) / 128.0)) - 256
 *                  = round(256.0 / sqrt(0.25)) - 256
 *                  = round(256.0 / 0.5) - 256
 *                  = 512 - 256
 *                  = 256
 *
 *    The value 256 requires 9 bits to store. By choosing a value slightly larger than
 *    32.0, such as 32.3, the following result is obtained:
 *
 *      rsqTable[0] = round(256.0 / sqrt((0 + 32.3) / 128.0)) - 256
 *                  = round(256.0 / sqrt(0.25234375)) - 256
 *                 ~= round(256.0 / 0.5023383) - 256
 *                  = 510 - 256
 *                  = 254
 *
 *    The value 254 requires only 8 bits to store. Therefore the entire table can be
 *    stored in just 96 bytes. The error in the initial estimate introduced by choosing
 *    32.3 instead of 32.0 is corrected by the Newton-Raphson iterations following the
 *    table lookup.
 *
 *    Although the above discussion has assumed that n lies in the mathematical range
 *    (0.25, 1), the RSQ() function uses rsqTable[] to compute reciprocal square roots
 *    of values that lie in the mathematical range [0.25, 1) (note the inclusion of
 *    0.25). The first element of rsqTable[] (i.e., rsqTable[0]) is an estimate of the
 *    reciprocal square root of 0.25.
 *-----------------------------------------------------------------------------------
 */
static FS_CONST ADF_U8 rsqTable[96] = {
    /**
     *----Each entry holds the 8 fractional bits of a 24.8 fixed point estimate of
     *----the reciprocal square root; the integer part (always 1, i.e., 0x100) is
     *----added back by the caller. Entry i corresponds to the leading 7 bits of
     *----the input having the value (i + 32).
     */

    /* i =  0..15 */
    0xFE, 0xF6, 0xEF, 0xE7, 0xE1, 0xDA, 0xD4, 0xCE,
    0xC8, 0xC3, 0xBD, 0xB8, 0xB3, 0xAE, 0xAA, 0xA5,

    /* i = 16..31 */
    0xA1, 0x9C, 0x98, 0x94, 0x90, 0x8D, 0x89, 0x85,
    0x82, 0x7F, 0x7B, 0x78, 0x75, 0x72, 0x6F, 0x6C,

    /* i = 32..47 */
    0x69, 0x66, 0x64, 0x61, 0x5E, 0x5C, 0x59, 0x57,
    0x55, 0x52, 0x50, 0x4E, 0x4C, 0x49, 0x47, 0x45,

    /* i = 48..63 */
    0x43, 0x41, 0x3F, 0x3D, 0x3B, 0x3A, 0x38, 0x36,
    0x34, 0x32, 0x31, 0x2F, 0x2D, 0x2C, 0x2A, 0x29,

    /* i = 64..79 */
    0x27, 0x26, 0x24, 0x23, 0x21, 0x20, 0x1E, 0x1D,
    0x1C, 0x1A, 0x19, 0x18, 0x16, 0x15, 0x14, 0x13,

    /* i = 80..95 */
    0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0B, 0x0A, 0x09,
    0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01
};
/**
 *-----------------------------------------------------------------------------------
 *    RSQ(n): compute 1 / sqrt(n) for a 0.32 fixed point input n that lies in the
 *    mathematical range [0.25, 1) (i.e., n >= 2^30 as an unsigned integer) and
 *    return the result as a 2.30 fixed point value. The estimate is seeded from
 *    rsqTable[] and refined by two Newton-Raphson iterations of the form
 *    x{i+1} = 0.5 * x{i} * (3 - n * x{i}^2).
 *
 *    The input is not range checked: for n < 2^30 the table index (n >> 25) - 32
 *    does not fall in [0, 95], so the lookup reads outside rsqTable[].
 *
 *    The implementation is selected at compile time by ADF_MATH_MODE. The x86
 *    assembly branch contains no return statement; it leaves its result in eax
 *    (presumably relying on the compiler returning eax for this function -- confirm
 *    against the toolchain in use). If ADF_MATH_MODE matches none of the branches
 *    below, the function body is empty.
 *-----------------------------------------------------------------------------------
 */
static ADF_U32 RSQ (ADF_U32 n)
{
#if ((ADF_MATH_MODE == ADF_MATH_FIXED_C_32) || \
    (ADF_MATH_MODE == ADF_MATH_FIXED_C_64))
    {
        /**
         *----C implementation (32-bit and 64-bit). x0 is the initial estimate
         *----obtained by a table lookup. x1 is the estimate obtained after the first
         *----Newton-Raphson iteration. x2 is the estimate obtained after the second
         *----Newton-Raphson iteration.
         */
        ADF_U32 x0, x1, x2;
            

        /**
         *----Perform a table lookup on the leading 7 bits of n to obtain the 8
         *----fractional bits of a 24.8 fixed point estimate to rsq(n) and add the
         *----integer portion of the 24.8 fixed point estimate (i.e., 0x100), which
         *----always has the mathematical value 1. The result is a 24.8 fixed point
         *----estimate to rsq(n). Running example: consider the case where n is
         *----0x80000000. Considered as a 0.32 fixed point value, n has the
         *----mathematical value (0x80000000 / 2^32) = 0.5. x0 = rsqTable[(n >> 25) -
         *----32] + 0x100 = rsqTable[(0x80000000 >> 25) - 32] + 0x100 = rsqTable[32]
         *----+ 0x100 = 0x69 + 0x100 = 0x169, which has the mathematical value (0x169
         *----/ 2^8) = 1.41015625. This is the initial estimate of the reciprocal
         *----square root of 0.5.
         */
        x0 = rsqTable[(n >> 25) - 32] + 0x100;
    

        /**
         *----Begin the first Newton-Raphson iteration. Compute the 16.16 fixed point
         *----value (x0 * x0). Running example: x1 = x0 * x0 = 0x169 * 0x169 =
         *----0x1fd11, which has the mathematical value (0x1fd11 / 2^16) =
         *----1.9885406494140625.
         */
        x1 = x0 * x0;


        /**
         *----Convert x0 from a 24.8 fixed point value to a 17.15 fixed point value.
         *----Running example: x0 = (x0 << 7) = (0x169 << 7) = 0xb480, which has the
         *----mathematical value (0xb480 / 2^15) = 1.41015625.
         */
        x0 <<= 7;


        /**
         *----Convert n from a 0.32 fixed point value to a 17.15 fixed point value
         *----and compute the 17.15 fixed point value (n * x0 * x0). Running example:
         *----n >> 17 = 0x80000000 >> 17 = 0x4000. (n >> 17) * x1 = 0x4000 * 0x1fd11
         *----= 0x000000007f444000. Extracting the middle 32 bits of the 64-bit
         *----product yields x1 = 0x00007f44, which has the mathematical value
         *----(0x7f44 / 2^15) = 0.9942626953125.
         */
        x1 = UMUL64_MID32(n >> 17, x1);


        /**
         *----Compute the 17.15 fixed point value (3 - (n * x0 * x0)). Running
         *----example: x1 = (3 << 15) - x1 = 0x18000 - 0x7f44 = 0x100bc, which has
         *----the mathematical value (0x100bc / 2^15) = 2.0057373046875.
         */
        x1 = (3 << 15) - x1;

    
        /**
         *----Compute the 1.31 fixed point value (0.5 * x0 * (3 - (n * x0 * x0))).
         *----Note that the multiplication below of two 17.15 fixed point values
         *----produces a 64-bit 34.30 fixed point value. Only the low 32 bits of the
         *----64-bit product are kept, thereby truncating the 34.30 fixed point value
         *----to a 2.30 fixed point value. The multiplication by 0.5 effectively
         *----shifts the binary point to the left by one bit, which makes x1 a 1.31
         *----fixed point value. This is the end of the first Newton-Raphson
         *----iteration. Running example: x1 = x0 * x1 = 0xb480 * 0x100bc =
         *----0xb5048e00, which has the mathematical value (0xb5048e00 / 2^31) =
         *----1.4142014980316162109375. Note that x1 is a better estimate of the
         *----reciprocal square root of 0.5 than x0.
         */
        x1 = x0 * x1;


        /**
         *----Begin the second Newton-Raphson iteration. Compute the 1.31 fixed point
         *----value (n * x1). Running example: x2 = (n * x1) = (0x80000000 *
         *----0xb5048e00) = 0x5a82470000000000. Keeping only the high 32 bits of the
         *----64-bit product yields the result x2 = 0x5a824700, which has the
         *----mathematical value (0x5a824700 / 2^31) = 0.70710074901580810546875.
         */
        x2 = UMUL64_HIGH32(n, x1);


        /**
         *----Compute the 2.30 fixed point value (n * x1 * x1). Running example: x2 =
         *----(x2 * x1) = (0x5a824700 * 0xb5048e00) = 0x3fffb8705f620000. Keeping
         *----only the high 32 bits of the 64-bit product yields the result x2 =
         *----0x3fffb870, which has the mathematical value (0x3fffb870 / 2^30) =
         *----0.99998293817043304443359375.
         */
        x2 = UMUL64_HIGH32(x2, x1);


        /**
         *----Compute the 2.30 fixed point value (3 - (n * x1 * x1)). Running
         *----example: x2 = (3 << 30) - x2 = (0xc0000000) - (0x3fffb870) =
         *----0x80004790, which has the mathematical value (0x80004790 / 2^30) =
         *----2.00001706182956695556640625.
         */
        x2 = ((ADF_U32)3 << 30) - x2;


        /**
         *----Compute the 2.30 fixed point value (0.5 * x1 * (3 - (n * x1 * x1))).
         *----Note that the multiplication below of a 1.31 fixed point value and a
         *----2.30 fixed point value produces a 64-bit 3.61 fixed point value. Only
         *----the high 32 bits of the 64-bit product are kept, thereby truncating the
         *----3.61 fixed point value to a 3.29 fixed point value. The multiplication
         *----by 0.5 effectively shifts the binary point to the left by one bit,
         *----which makes x2 a 2.30 fixed point value. This is the end of the second
         *----Newton-Raphson iteration. Running example: x2 = (x1 * x2) = (0xb5048e00
         *----* 0x80004790) = 0x5a82799a15f1e000. Keeping only the high 32 bits of
         *----the 64-bit product yields the result 0x5a82799a, which has the
         *----mathematical value (0x5a82799a / 2^30) =
         *----1.41421356238424777984619140625. The true reciprocal square root of 0.5
         *----is approximately 1.4142135623730950488016887242097. In this example,
         *----the computed result x2 is accurate to within 0.000000000012 of the true
         *----answer.
         */
        x2 = UMUL64_HIGH32(x1, x2);


        /**
         *----Return the result
         */
        return(x2);
    }
#elif (ADF_MATH_MODE == ADF_MATH_FIXED_ASM_X86)
    {
        /**
         *----x86 assembly implementation. The steps mirror the C implementation
         *----above one for one. There is no return statement in this branch; the
         *----final value is left in eax by the last instruction.
         */
        __asm {

        
            /**
             *----Set ebx to the input n
             */
            mov      ebx, n;


            /**
             *----Perform a table lookup on the leading 7 bits of n to obtain the 8
             *----fractional bits of a 24.8 fixed point estimate to rsq(n) and add
             *----the integer portion of the 24.8 fixed point estimate (i.e., 0x100),
             *----which always has the mathematical value 1. The result is a 24.8
             *----fixed point estimate to rsq(n).
             */
            mov      edi, ebx; 
            shr      edi, 25;
            sub      edi, 32;
            mov      eax, 0;
            mov      al,  rsqTable[edi];
            add      eax, 0x100;


            /**
             *----Let x0 be the current value of eax, which is a 24.8 fixed point
             *----estimate to rsq(n). Copy eax to esi.
             */
            mov      esi, eax;


            /**
             *----Begin the first Newton-Raphson iteration. Compute the 16.16 fixed
             *----point value [edx:eax] = (x0 * x0).
             */
            imul  eax;

        
            /**
             *----Convert n from a 0.32 fixed point value to a 17.15 fixed point
             *----value and compute the 17.15 fixed point value (n * x0 * x0). The
             *----shift/shift/or sequence after the multiply keeps the middle 32 bits
             *----of the 64-bit product [edx:eax] (the high 16 bits of eax combined
             *----with the low 16 bits of edx), which corresponds to the
             *----UMUL64_MID32() step of the C implementation.
             */
            mov      edi, ebx;
            shr      edi, 17;
            imul  edi;    
            shr      eax, 16;
            shl      edx, 16;
            or      eax, edx;

        
            /**
             *----Convert x0 (i.e., esi) from a 24.8 fixed point value to a 17.15
             *----fixed point value
             */
            shl      esi, 7;


            /**
             *----Compute the 17.15 fixed point value (3 - (n * x0 * x0)). The
             *----subtract-then-negate pair evaluates (0x18000 - eax), where 0x18000
             *----is the value 3 in 17.15 fixed point (i.e., 3 << 15).
             */
            sub      eax, 0x18000;
            neg      eax;

        
            /**
             *----Compute the 1.31 fixed point value (0.5 * x0 * (3 - (n * x0 *
             *----x0))). Note that the multiplication below of two 17.15 fixed point
             *----values produces a 64-bit 34.30 fixed point value. Only the low 32
             *----bits of the 64-bit product are kept, thereby truncating the 34.30
             *----fixed point value to a 2.30 fixed point value. The multiplication
             *----by 0.5 effectively shifts the binary point to the left by one bit,
             *----which makes x1 a 1.31 fixed point value. This is the end of the
             *----first Newton-Raphson iteration.
             */
            imul  eax, esi;

        
            /**
             *----Let x1 be the current value of eax, which is a 1.31 fixed point
             *----estimate to rsq(n). Copy eax to esi.
             */
            mov      esi, eax;

        
            /**
             *----Begin the second Newton-Raphson iteration. Compute the 1.31 fixed
             *----point value (n * x1). Only the high 32 bits of the 64-bit product
             *----(i.e., edx) are kept.
             */
            mul      ebx;
            mov      eax, edx;

        
            /**
             *----Compute the 2.30 fixed point value (n * x1 * x1)
             */
            mul      esi;

        
            /**
             *----Compute the 2.30 fixed point value (3 - (n * x1 * x1)). The
             *----subtract-then-negate pair evaluates (0xc0000000 - edx), where
             *----0xc0000000 is the value 3 in 2.30 fixed point (i.e., 3 << 30).
             */
            mov      eax, edx;
            sub      eax, 0xc0000000;
            neg      eax;


            /**
             *----Compute the 2.30 fixed point value (0.5 * x1 * (3 - (n * x1 *
             *----x1))). Note that the multiplication below of a 1.31 fixed point
             *----value and a 2.30 fixed point value produces a 64-bit 3.61 fixed
             *----point value. Only the high 32 bits of the 64-bit product are kept,
             *----thereby truncating the 3.61 fixed point value to a 3.29 fixed point
             *----value. The multiplication by 0.5 effectively shifts the binary
             *----point to the left by one bit, which makes x2 a 2.30 fixed point
             *----value. This is the end of the second Newton-Raphson iteration.
             */
            mul      esi;
            mov      eax, edx;
        }
    }
#endif
}


/**
 *-----------------------------------------------------------------------------------
 *    FIXED POINT IMPLEMENTATION NOTES ON COMPUTING RECIPROCAL SQUARE ROOTS
 *-----------------------------------------------------------------------------------
 *    The RSQ() function (see above) computes the reciprocal square root of a 0.32
 *    fixed point value that lies in the mathematical range [0.25, 1). In other words,
 *    RSQ() is a specialized function that only handles a restricted set of fixed point
 *    input values. This section describes how to use RSQ() to compute reciprocal
 *    square roots of general fixed point values.
 *
 *    First, consider the following algorithm for computing the reciprocal square root
 *    of any (real-valued) number M:
 *
 *      1. Normalize M to the range [0.25, 1) by choosing a normalization exponent e
 *         such that (M / (2^e)) lies in the range [0.25, 1).
 *
 *      2. Compute the reciprocal square root R of the normalized value of M (i.e.,
 *         compute R = rsqrt(M / 2^e) = 1 / sqrt(M / (2^e)) = sqrt(2^e / M)).
 *
 *      3. Compute the reciprocal square root of M by unnormalizing R (i.e., compensate
 *         for the normalization of M in step 1). To unnormalize R, note that R =
 *         sqrt(2^e / M) = sqrt(2^e) / sqrt(M) = 2^(e/2) * rsqrt(M). It follows that
 *         rsqrt(M) = R * 2^(-e/2). Therefore, computing the reciprocal square root of
 *         M simply requires multiplying R by 2^(-e/2).
 *
 *    This algorithm can be translated to a fixed point implementation as follows.
 *
 *    Suppose that the value M in the above algorithm is actually the mathematical
 *    value of a 32-bit I.F fixed point value N (i.e., N has I integer bits and F
 *    fractional bits, where I + F = 32 and M = (N / (2^F))). The following three
 *    steps compute the reciprocal square root of N:
 *
 *      a. Normalize N by shifting N to the left by s bits, where s is an integer
 *         chosen so that (N * 2^s) is at least 2^30. Note that this is equivalent to
 *         choosing s such that (N * 2^s) is a 0.32 fixed point value that lies in the
 *         mathematical range [0.25, 1) (i.e., 0.25 <= (N * 2^s / 2^32) < 1). The
 *         relationship between s and the exponent e (see step 1 above) will be
 *         explained in step c below.
 *
 *      b. Compute the reciprocal square root of the normalized value of N by calling
 *         the RSQ() function with (N * 2^s) as its argument. The result R is a 2.30
 *         fixed point value that represents the reciprocal square root of the
 *         normalized value of N.
 *
 *      c. Unnormalize R by shifting R to the right by (16 - 0.5 * (F + s)) bits. To
 *         derive this formula, recall from step 3 that unnormalizing R requires
 *         multiplying R by 2^(-e/2), where e is the normalization exponent chosen so
 *         that (M / (2^e)) lies in the range [0.25, 1). Note that the normalized value
 *         (M / (2^e)) is equal to the normalized value (N * 2^s / 2^32) (see step a
 *         above):
 *
 *           M / 2^e = N * 2^s / 2^32
 *                   = N * 2^(s-32)
 *                   = M * 2^F * 2^(s-32)
 *                   = M * 2^(F+s-32)
 *                   = M / 2^(32-F-s)
 *
 *         Note that the third equation above uses the fact that N = M * 2^F. By
 *         equating the exponents, it follows that e = 32 - F - s, where F is the
 *         number of fractional bits of the fixed point value N, and s is the integer
 *         chosen above in step a. Multiplication by 2^(-e/2) can be implemented as a
 *         right shift by e/2 = (16 - 0.5 * (F + s)) bits.
 *
 *    Note that the functions I2408_RSQ_I1616() and I1616_RSQ() compute their results
 *    as 16.16 fixed point values. Since the RSQ() function computes its result as a
 *    2.30 fixed point value, the conversion to a 16.16 fixed point value requires an
 *    additional right shift of 14 bits.
 *
 *    For the I2408_RSQ_I1616() function, the input N is a 24.8 fixed point value
 *    (i.e., I = 24, F = 8). Therefore, the computed 2.30 fixed point reciprocal square
 *    root R is unnormalized and converted to a 16.16 fixed point value by shifting R
 *    to the right by ((16 - (0.5 * (8 + s))) + 14) = (26 - (s / 2)) bits.
 *
 *    For the I1616_RSQ() function, the input N is a 16.16 fixed point value (i.e., I =
 *    16, F = 16). Therefore, the computed 2.30 fixed point reciprocal square root R is
 *    unnormalized and converted to a 16.16 fixed point value by shifting R to the
 *    right by ((16 - (0.5 * (16 + s))) + 14) = (22 - (s / 2)) bits.
 *
 *    Since I2408_RSQ_I1616() and I1616_RSQ() must divide s by 2 in step c, both
 *    functions choose an even value of s in step a, thereby allowing the division of
 *    s by 2 to be implemented as a right shift (i.e., (s >> 1)).
 *-----------------------------------------------------------------------------------
 */


/**
 *-----------------------------------------------------------------------------------
 *    Compute and return the reciprocal square root of n (i.e., 1 / sqrt(n)), where the
 *    input n is an ADF_I2408 fixed point value and the computed value is an ADF_I1616
 *    fixed point value. The computed value is rounded towards negative infinity. If
 *    the input n is less than or equal to zero, the result is undefined.
 *
 *    All assembly and C implementations produce bit-identical results, except in the
 *    case that the input n is zero.
 *
 *    Implementation notes:
 *
 *      - I2408_RSQ_I1616() computes the reciprocal square root of n using the approach
 *        described in the section Fixed Point Implementation Notes On Computing
 *        Reciprocal Square Roots (see above).
 *
 *      - The documentation for the C implementation below uses a running example to
 *        help explain the fixed point implementation. This running example considers
 *        the case where the input value is n = 0x4000 (i.e., the mathematical value
 *        (0x4000 / 2^8) = 64).
 *-----------------------------------------------------------------------------------
 */
ADF_I1616 I2408_RSQ_I1616 (ADF_I2408 n)
{
    /**
     *----Reinterpret the bits of the 24.8 fixed point input n as an unsigned
     *----32-bit integer. Running example: n = 0x4000, which has the mathematical
     *----value (0x4000 / 2^8) = 64.
     */
    ADF_U32 bits = *(ADF_U32 *) &n;


    /**
     *----Pick the normalization shift: the number of leading zeroes of the input
     *----with its LSB cleared. Clearing the LSB makes the shift even, so that it
     *----can be halved exactly when the result is unnormalized below. Running
     *----example: CountLeadingZeroes(0x4000) is 17 (0x4000 has 17 zeroes before
     *----its leading 1 bit), which gives a shift of (17 & 0xfffffffe) = 16.
     */
    ADF_U32 evenShift = CountLeadingZeroes(bits) & 0xfffffffe;


    /**
     *----Normalize the input by moving its leading 1 bit into bit 31 or bit 30.
     *----Read as a 0.32 fixed point value, the normalized input lies in the
     *----mathematical range [0.25, 1), which is the range RSQ() accepts. Running
     *----example: (0x4000 << 16) = 0x40000000, which has the mathematical value
     *----(0x40000000 / 2^32) = 0.25.
     */
    ADF_U32 normalized = bits << evenShift;


    /**
     *----Let RSQ() compute the reciprocal square root of the normalized input as
     *----a 2.30 fixed point value. Running example: RSQ(0x40000000) = 0x7fffffff,
     *----which has the mathematical value (0x7fffffff / 2^30) =
     *----1.999999999068677425384521484375.
     */
    ADF_U32 recip = RSQ(normalized);


    /**
     *----Undo the normalization and convert from 2.30 to 16.16 fixed point in a
     *----single right shift of (26 - evenShift/2) bits (the derivation is given in
     *----the section Fixed Point Implementation Notes On Computing Reciprocal
     *----Square Roots). Running example: 0x7fffffff >> (26 - (16 >> 1)) =
     *----0x7fffffff >> 18 = 0x1fff, which has the mathematical value
     *----(0x1fff / 2^16) = 0.1249847412109375. The true reciprocal square root of
     *----64 is 1/8 = 0.125.
     */
    recip >>= 26 - (evenShift >> 1);


    /**
     *----Hand the bits back to the caller as an ADF_I1616 value
     */
    return(*(ADF_I1616 *) &recip);
}


/**
 *-----------------------------------------------------------------------------------
 *    Compute and return the reciprocal square root of n (i.e., 1 / sqrt(n)), where the
 *    input n and the computed result are ADF_I1616 fixed point values. The computed
 *    value is rounded towards negative infinity. If the input n is less than or equal
 *    to zero, the result is undefined.
 *
 *    All assembly and C implementations produce bit-identical results, except in the
 *    case that the input n is zero.
 *
 *    Implementation notes:
 *
 *      - I1616_RSQ() computes the reciprocal square root of n using the approach
 *        described in the section Fixed Point Implementation Notes On Computing
 *        Reciprocal Square Roots (see above).
 *
 *      - The documentation for the C implementation below uses a running example to
 *        help explain the fixed point implementation. This running example considers
 *        the case where the input value is n = 0x4000 (i.e., the mathematical value
 *        (0x4000 / 2^16) = 1/4 = 0.25).
 *
 *    Performance notes:
 *
 *    Intel Centrino Core Duo T2500 (2 MB L2, 2.0 GHz, FSB 677 MHz), MSVC 6 compiler,
 *    Release mode:
 *      - ADF_MATH_FIXED_C_64       is ~1.7x as fast as ADF_MATH_FIXED_C_32
 *      - ADF_MATH_FIXED_ASM_X86 is ~1.3x as fast as ADF_MATH_FIXED_C_64
 *      - ADF_MATH_FIXED_ASM_X86 is ~2.2x as fast as ADF_MATH_FIXED_C_32
 *-----------------------------------------------------------------------------------
 */
ADF_I1616 I1616_RSQ (ADF_I1616 n)
{
    /**
     *----Reinterpret the bits of the 16.16 fixed point input n as an unsigned
     *----32-bit integer. Running example: n = 0x4000, which has the mathematical
     *----value (0x4000 / 2^16) = 1/4 = 0.25.
     */
    ADF_U32 bits = *(ADF_U32 *) &n;


    /**
     *----Pick the normalization shift: the number of leading zeroes of the input
     *----with its LSB cleared. Clearing the LSB makes the shift even, so that it
     *----can be halved exactly when the result is unnormalized below. Running
     *----example: CountLeadingZeroes(0x4000) is 17 (0x4000 has 17 zeroes before
     *----its leading 1 bit), which gives a shift of (17 & 0xfffffffe) = 16.
     */
    ADF_U32 evenShift = CountLeadingZeroes(bits) & 0xfffffffe;


    /**
     *----Normalize the input by moving its leading 1 bit into bit 31 or bit 30.
     *----Read as a 0.32 fixed point value, the normalized input lies in the
     *----mathematical range [0.25, 1), which is the range RSQ() accepts. Running
     *----example: (0x4000 << 16) = 0x40000000, which has the mathematical value
     *----(0x40000000 / 2^32) = 0.25.
     */
    ADF_U32 normalized = bits << evenShift;


    /**
     *----Let RSQ() compute the reciprocal square root of the normalized input as
     *----a 2.30 fixed point value. Running example: RSQ(0x40000000) = 0x7fffffff,
     *----which has the mathematical value (0x7fffffff / 2^30) =
     *----1.999999999068677425384521484375.
     */
    ADF_U32 recip = RSQ(normalized);


    /**
     *----Undo the normalization and convert from 2.30 to 16.16 fixed point in a
     *----single right shift of (22 - evenShift/2) bits (the derivation is given in
     *----the section Fixed Point Implementation Notes On Computing Reciprocal
     *----Square Roots). Running example: 0x7fffffff >> (22 - (16 >> 1)) =
     *----0x7fffffff >> 14 = 0x1ffff, which has the mathematical value
     *----(0x1ffff / 2^16) = 1.9999847412109375. The true reciprocal square root
     *----of 0.25 is 2.
     */
    recip >>= 22 - (evenShift >> 1);


    /**
     *----Hand the bits back to the caller as an ADF_I1616 value
     */
    return(*(ADF_I1616 *) &recip);
}


/**
 *-----------------------------------------------------------------------------------
 *    Normalize the 2D vector (nx, ny) using high-precision arithmetic and return the
 *    result in (nxOut, nyOut). The inputs nx and ny and the outputs nxOut and nyOut
 *    are ADF_I1616 fixed point values. The following six cases are possible:
 *
 *      1) If nx and ny are both zero (i.e., the vector has zero length),
 *         I1616_NORMALIZE() returns zero and the contents of nxOut and nyOut are
 *         undefined. 
 *
 *      2) If nx is zero and ny is positive, I1616_NORMALIZE() returns one, stores zero
 *         into nxOut, and stores the ADF_I1616 representation of 1 (i.e., 0x00010000)
 *         into nyOut.
 *
 *      3) If nx is zero and ny is negative, I1616_NORMALIZE() returns one, stores zero
 *         into nxOut, and stores the ADF_I1616 representation of -1 (i.e., 0xffff0000)
 *         into nyOut.
 *
 *      4) If nx is positive and ny is zero, I1616_NORMALIZE() returns one, stores the
 *         ADF_I1616 representation of 1 (i.e., 0x00010000) into nxOut, and stores zero
 *         into nyOut.
 *
 *      5) If nx is negative and ny is zero, I1616_NORMALIZE() returns one, stores the
 *         ADF_I1616 representation of -1 (i.e., 0xffff0000) into nxOut, and stores
 *         zero into nyOut.
 *
 *      6) If nx and ny are both non-zero, I1616_NORMALIZE() returns one, stores the x
 *         component of the normalized vector into nxOut, and stores the y component of
 *         the normalized vector into nyOut. 
 *
 *    All assembly and C implementations produce bit-identical results for nxOut and
 *    nyOut, except in the case where inputs nx and ny are both zero (in which case the
 *    contents of nxOut and nyOut are undefined).
 *
 *    Implementation notes:
 *
 *      - When both components of the input vector (nx, ny) are non-zero, the vector
 *        (nx, ny) is normalized using the following steps:
 *
 *          1. The squared length of the vector (nx, ny) is computed using 64-bit
 *             integer arithmetic and stored as a 64-bit 32.32 fixed point value.
 *
 *          2. The squared length is normalized to a 0.64 fixed point value that lies
 *             in the mathematical range [0.25, 1). This normalization step is
 *             implemented by shifting the squared length so that the leading 1 is in
 *             bit position 63 or 62 (i.e., the leading 1 is one of the two high
 *             bits of the 64-bit value). After this step, the low 32 bits of the 0.64
 *             fixed point value are truncated (i.e., cleared to zero).
 *
 *          3. The reciprocal square root of the normalized squared length is computed
 *             using the RSQ() function. The result is a 2.30 fixed point estimate of
 *             the inverse length of the vector.
 *
 *          4. nx and ny are multiplied by the inverse length of the vector using
 *             64-bit integer arithmetic. The normalized vector components are
 *             converted to ADF_I1616 fixed point values and stored in (nxOut, nyOut).
 *
 *          5. When the x and y components of the normalized vector are converted to
 *             ADF_I1616 fixed point values, excess fractional bits are discarded,
 *             effectively rounding both components towards negative infinity.
 *
 *    Performance notes:
 *
 *    Intel Centrino Core Duo T2500 (2 MB L2, 2.0 GHz, FSB 677 MHz), MSVC 6 compiler,
 *    Release mode:
 *      - ADF_MATH_FIXED_C_64       is ~1.4x as fast as ADF_MATH_FIXED_C_32
 *      - ADF_MATH_FIXED_ASM_X86 is ~1.6x as fast as ADF_MATH_FIXED_C_64
 *      - ADF_MATH_FIXED_ASM_X86 is ~2.2x as fast as ADF_MATH_FIXED_C_32
 *-----------------------------------------------------------------------------------
 */
ADF_I1616 I1616_NORMALIZE (ADF_I1616 nx, ADF_I1616 ny, ADF_I1616 *nxOut,
ADF_I1616 *nyOut)
{
#if (ADF_MATH_MODE != ADF_MATH_FIXED_ASM_X86)
    {
        /**
         *----C implementation (32-bit and 64-bit)
         */
        ADF_U32 s;
        ADF_U32 lenSqr;
        ADF_U32 invLen;
        ADF_U32 sumHigh, sumLow;
        ADF_U32 nxnxHigh, nxnxLow;
        ADF_U32 nynyHigh, nynyLow;
        ADF_U32 nxNormHigh, nxNormLow;
        ADF_U32 nyNormHigh, nyNormLow;
        ADF_U32 nxNorm, nyNorm;


        /**
         *----Determine the signs of the normal components nx and ny
         */
        ADF_I32 xSign = (nx < 0);
        ADF_I32 ySign = (ny < 0);


        /**
         *----The squared length of (nx, ny) is zero if and only if nx and ny are
         *----both zero. If both nx and ny are zero, return zero to indicate that the
         *----length of the input vector is zero.
         */
        if (!nx && !ny) return(0);


        /**
         *----Set nx to abs(nx) and set ny to abs(ny)
         */
        if (xSign) nx = -nx;
        if (ySign) ny = -ny;


        /**
         *----Determine if nx is zero
         */
        if (!nx) {


            /**
             *----nx is zero. If y is negative, set (nxOut, nyOut) to (0, -1).
             *----Otherwise, set (nxOut, nyOut) to (0, 1). Return 1 to indicate that
             *----the length of the input vector is non-zero.
             */
            *nxOut = 0;
            *nyOut = (ySign) ? 0xffff0000 : 0x00010000;
            return(1);
        }


        /**
         *----Determine if ny is zero
         */
        if (!ny) {


            /**
             *----ny is zero. If x is negative, set (nxOut, nyOut) to (-1, 0).
             *----Otherwise, set (nxOut, nyOut) to (1, 0). Return 1 to indicate that
             *----the length of the input vector is non-zero.
             */
            *nxOut = (xSign) ? 0xffff0000 : 0x00010000;
            *nyOut = 0;
            return(1);
        }


        /**
         *----nx and ny are both non-zero. Compute the 64-bit 32.32 fixed point value
         *----(nx * nx) and the 64-bit 32.32 fixed point value (ny * ny).
         */
        UMUL64(nx, nx, &nxnxHigh, &nxnxLow);
        UMUL64(ny, ny, &nynyHigh, &nynyLow);


        /**
         *----Compute the 64-bit 32.32 fixed point sum ((nx * nx) + (ny * ny)). Note
         *----that the expression (sumLow < nxnxLow) is the carry bit of sumLow.
         */
        /*lint -e514  Warning 514: Unusual use of a Boolean expression */
        sumLow = nxnxLow + nynyLow;
        sumHigh = nxnxHigh + nynyHigh + (sumLow < nxnxLow);
        /*lint +e514  Warning 514: Unusual use of a Boolean expression */
    

        /**
         *----Determine if the 64-bit 32.32 fixed point sum contains a 1 bit in the
         *----high 32 bits
         */
        if (sumHigh) {


            /**
             *----At least one of the high 32 bits of the sum is 1. Choose an integer
             *----s such that the 64-bit sum multiplied by 2^s is at least 2^62.
             *----Since at least one of the high 32 bits of the sum is 1, use only
             *----the high 32 bits to compute the number of leading zeroes. Then
             *----clear the LSB, thereby ensuring that s an even number. An even
             *----value of s is chosen to ensure that (s / 2) is an integer; an
             *----integral value of the expression (s / 2) is needed below to
             *----implement the division of s by 2 as a right shift.
             */
            s = CountLeadingZeroes(sumHigh) & 0xfffffffe;


            /**
             *----Normalize the 64-bit 32.32 fixed point sum [sumHigh:sumLow] to a
             *----0.64 fixed point value that lies in the mathematical range [0.25,
             *----1). By storing the 64-bit result as a 32-bit integer, the low 32
             *----bits of the 0.64 fixed point value are truncated (i.e., effectively
             *----set to zero). Considered as an unsigned integer, lenSqr has its
             *----leading 1 in bit 31 or bit 30 (i.e., in one of the high two bits of
             *----the 32-bit integer) and thus is at least 2^30. Implementation note:
             *----s is an even integer that lies in the range [0, 30] (note that s
             *----cannot be 32 because sumHigh contains at least one 1 bit). If s is
             *----zero, then the right shift below (i.e., sumLow >> (32 - s)) becomes
             *----a right shift of 32 bits. However, the ANSI C specification states
             *----that if an operand of size N bits is shifted by an amount M such
             *----that M >= N, then the result is undefined (and hence not portable).
             *----In this case, sumLow is a 32-bit integer, so shifting it by 32 bits
             *----produces an undefined result. In practice, this issue is irrelevant
             *----for the following reason. If s is zero, then either bit 31 (i.e.,
             *----the MSB) or bit 30 of sumHigh is 1, which implies that the squared
             *----length of the vector (nx, ny) is at least 2^62. Since nx and ny are
             *----positive signed integers, however, the MSBs of both nx and ny must
             *----be zero. Therefore, the squared length of (nx, ny) must be less
             *----than 2^62, which proves that s cannot be zero.
             */
            lenSqr = (sumHigh << s) | (sumLow >> (32 - s));


        } else {


            /**
             *----The high 32 bits of the sum are zero. Choose an integer s such that
             *----the 64-bit sum multiplied by 2^s is at least 2^62. Since the high
             *----32 bits of the sum are zero, use only the low 32 bits to compute
             *----the number of leading zeroes of the sum. Then clear the LSB,
             *----thereby ensuring that s is an even number. An even value of s is
             *----chosen to ensure that (s / 2) is an integer; an integral value of
             *----the expression (s / 2) is needed below to implement the division of
             *----s by 2 as a right shift.
             */
            s = CountLeadingZeroes(sumLow) & 0xfffffffe;


            /**
             *----Normalize the 64-bit 32.32 fixed point sum [sumHigh:sumLow] to a
             *----0.64 fixed point value that lies in the mathematical range [0.25,
             *----1). By storing the 64-bit result as a 32-bit integer, the low 32
             *----bits of the 0.64 fixed point value are truncated (i.e., effectively
             *----set to zero). Considered as an unsigned integer, lenSqr has its
             *----leading 1 in bit 31 or bit 30 (i.e., in one of the high two bits of
             *----the 32-bit integer) and thus is at least 2^30.
             */
            lenSqr = (sumLow << s);


            /**
             *----Add 32 to the normalization shift amount s because the high 32 bits
             *----of the 64-bit sum are zero
             */
            s += 32;
        }


        /**
         *----Compute the reciprocal square root of the squared length of the vector
         *----by calling the RSQ() function. Note that RSQ() requires that its input
         *----be a 32-bit 0.32 fixed point value that lies in the mathematical range
         *----[0.25, 1). lenSqr actually represents a 64-bit 0.64 fixed point value
         *----that lies in the mathematical range [0.25, 1), but only the high 32
         *----bits of this 64-bit value are stored (i.e., the low 32 fractional bits
         *----are zero). RSQ() returns a 2.30 fixed point value, which is effectively
         *----a 2.62 fixed point value with the low 32 fractional bits set to zero.
         */
        invLen = RSQ(lenSqr);


        /**
         *----Multiply (nx, ny) by the inverse length of the vector. Store the high
         *----32 bits of the vector in (nxNormHigh, nyNormHigh) and store the low 32
         *----bits of the vector in (nxNormLow, nyNormLow).
         */
        UMUL64(nx, invLen, &nxNormHigh, &nxNormLow);
        UMUL64(ny, invLen, &nyNormHigh, &nyNormLow);


        /**
         *----Unnormalize the vector components and convert the results to 16.16
         *----fixed point values by shifting the vector components to the right by
         *----(46 - (s / 2)) bits. This expression is obtained as follows. First,
         *----unnormalizing the vector components can be accomplished by shifting
         *----them to the right by (16 - 0.5 * (F + s)) bits, where F is the number
         *----of fractional bits of the squared length of the vector, and s is the
         *----normalization shift determined above (see the section Fixed Point
         *----Implementation Notes On Computing Reciprocal Square Roots for the
         *----derivation of this formula). In this case, the squared length of the
         *----vector is represented as a 64-bit 32.32 fixed point value, so there are
         *----32 fractional bits (i.e., F = 32). Therefore, the unnormalization step
         *----requires shifting the vector components to the right by (16 - 0.5 * (32
         *----+ s)) = 16 - 16 - s/2 = -s/2 bits. Finally, converting from a 2.62
         *----fixed point value to a 16.16 fixed point value requires a right shift
         *----of 46 bits. Therefore, the vector components need to be shifted to the
         *----right by a total of (46 - (s / 2)) bits.
         */
        s = 46 - (s >> 1);


        /**
         *----Compare the shift amount to 32
         */
        if (s >= 32) {


            /**
             *----The shift amount is at least 32. Therefore, it is sufficient to
             *----shift only the high 32 bits, because the low 32 bits are completely
             *----shifted off the right end.
             */
            s -= 32;
            nxNorm = nxNormHigh >> s;
            nyNorm = nyNormHigh >> s;


            /**
             *----If the x component is negative, negate the unsigned component of
             *----the normalized vector
             */
            if (xSign) {
                nxNorm = -((ADF_I32) nxNorm);


                /**
                 *----If a 1 bit was shifted off during unnormalization and
                 *----conversion from a 2.62 fixed point value to a 16.16 fixed point
                 *----value, subtract 1 from nxNorm to round the 16.16 fixed point
                 *----value towards negative infinity. To see why it is necessary to
                 *----subtract 1, consider rounding the value -1.5 towards negative
                 *----infinity. Simply dropping the fractional portion produces the
                 *----value -1.0, which has the effect of rounding -1.5 towards zero.
                 *----However, dropping the fractional portion and then subtracting 1
                 *----(i.e., -1.0 - 1 = -2.0) produces the desired result of rounding
                 *-----1.5 towards negative infinity.
                 */
                if ((nxNormHigh & ((1 << s) - 1)) || (nxNormLow)) nxNorm -= 1;
            }

            
            /**
             *----If the y component is negative, negate the unsigned component of
             *----the normalized vector
             */
            if (ySign) {
                nyNorm = -((ADF_I32) nyNorm);


                /**
                 *----If a 1 bit was shifted off during unnormalization and
                 *----conversion from a 2.62 fixed point value to a 16.16 fixed point
                 *----value, subtract 1 from nyNorm to round the 16.16 fixed point
                 *----value towards negative infinity. To see why it is necessary to
                 *----subtract 1, consider rounding the value -1.5 towards negative
                 *----infinity. Simply dropping the fractional portion produces the
                 *----value -1.0, which has the effect of rounding -1.5 towards zero.
                 *----However, dropping the fractional portion and then subtracting 1
                 *----(i.e., -1.0 - 1 = -2.0) produces the desired result of rounding
                 *-----1.5 towards negative infinity.
                 */
                if ((nyNormHigh & ((1 << s) - 1)) || (nyNormLow)) nyNorm -= 1;
            }


        } else {


            /**
             *----The shift amount is less than 32. Therefore, it is necessary to
             *----merge the high 32 bits and the low 32 bits of the 64-bit, 2.62
             *----fixed point result into a single 32-bit, 16.16 fixed point value.
             */
            nxNorm = ((nxNormHigh << (32 - s)) | (nxNormLow >> s));
            nyNorm = ((nyNormHigh << (32 - s)) | (nyNormLow >> s));


            /**
             *----If the x component is negative, negate the unsigned component of
             *----the normalized vector
             */
            if (xSign) {
                nxNorm = -((ADF_I32) nxNorm);


                /**
                 *----If a 1 bit was shifted off during unnormalization and
                 *----conversion from a 2.62 fixed point value to a 16.16 fixed point
                 *----value, subtract 1 from nxNorm to round the 16.16 fixed point
                 *----value towards negative infinity. To see why it is necessary to
                 *----subtract 1, consider rounding the value -1.5 towards negative
                 *----infinity. Simply dropping the fractional portion produces the
                 *----value -1.0, which has the effect of rounding -1.5 towards zero.
                 *----However, dropping the fractional portion and then subtracting 1
                 *----(i.e., -1.0 - 1 = -2.0) produces the desired result of rounding
                 *-----1.5 towards negative infinity.
                 */
                if (nxNormLow & ((1 << s) - 1)) nxNorm -= 1;
            }

            
            /**
             *----If the y component is negative, negate the unsigned component of
             *----the normalized vector
             */
            if (ySign) {
                nyNorm = -((ADF_I32) nyNorm);


                /**
                 *----If a 1 bit was shifted off during unnormalization and
                 *----conversion from a 2.62 fixed point value to a 16.16 fixed point
                 *----value, subtract 1 from nyNorm to round the 16.16 fixed point
                 *----value towards negative infinity. To see why it is necessary to
                 *----subtract 1, consider rounding the value -1.5 towards negative
                 *----infinity. Simply dropping the fractional portion produces the
                 *----value -1.0, which has the effect of rounding -1.5 towards zero.
                 *----However, dropping the fractional portion and then subtracting 1
                 *----(i.e., -1.0 - 1 = -2.0) produces the desired result of rounding
                 *-----1.5 towards negative infinity.
                 */
                if (nyNormLow & ((1 << s) - 1)) nyNorm -= 1;
            }
        }


        /**
         *----Store the signed components of the normalized vector
         */
        *nxOut = nxNorm;
        *nyOut = nyNorm;


        /**
         *----Return 1 to indicate that the length of the input vector is non-zero
         */
        return(1);
    }
#elif (ADF_MATH_MODE == ADF_MATH_FIXED_ASM_X86)
    {
        /**
         *----x86 assembly implementation
         */
        ADF_I32 returnValue;


        /**
         *----x86 assembly implementation
         */
        __asm {


            /**
             *----Set edi to nx and set ebx to ny
             */
            mov      edi, nx;
            mov      ebx, ny;


            /**
             *----Check the processor flags for nx (i.e., edi)
             */
            test  edi, edi;

        
            /**
             *----If nx (i.e., edi) is non-zero (i.e., processor flag ZF == 0), jump
             *----to nxNotZero
             */
            jne      nxNotZero;


            /**
             *----nx (i.e., edi) is zero. Check the processor flags for ny (i.e.,
             *----ebx).
             */
            test  ebx, ebx;


            /**
             *----If ny (i.e., ebx) is zero (i.e., processor flag ZF == 1), then both
             *----nx and ny are zero; jump to degenerate
             */
            je      degenerate;


            /**
             *----nx is zero and ny is non-zero. Therefore, the unit vector is either
             *----(0, -1) or (0, +1). Set the x component of the final unit vector to
             *----zero.
             */
            mov      nx, 0;


            /**
             *----The processor flags still reflect the state of ny (formerly ebx).
             *----ny is non-zero. If ny is negative (i.e., the processor flag SF ==
             *----1), then jump to downwardUnitVector.
             */
            js      downwardUnitVector;


            /**
             *----ny is positive (i.e., processor flag SF == 0), so the unit vector
             *----is (0, +1). Set the y component of the final unit vector to 1 and
             *----jump to nonDegenerate.
             */
            mov      ny, 0x00010000;
            jmp      nonDegenerate;


            /**
             *----ny is negative (i.e., processor flag SF == 1), so the unit vector
             *----is (0, -1). Set the y component of the final unit vector to -1 and
             *----jump to nonDegenerate.
             */
            downwardUnitVector:
            mov      ny, 0xffff0000;
            jmp      nonDegenerate;


            /**
             *----nx is non-zero. Check the processor flags for ny (i.e., ebx).
             */
            nxNotZero:
            test  ebx, ebx;


            /**
             *----If ny (i.e., ebx) is zero (i.e., processor flag ZF == 1), jump to
             *----nyZero
             */
            je      nyZero;


            /**
             *----nx and ny are both non-zero. All of the checks for special cases
             *----are complete. edi contains the input value of nx, and ebx contains
             *----the input value of ny. The processor flags reflect the state of ny
             *----(i.e., ebx). Compute the 64-bit square of nx (i.e., edi * edi). The
             *----high and low 32 bits of the 64-bit product are stored in edx and
             *----eax, respectively.
             */
            mov      eax, edi;
            imul  eax;
        

            /**
             *----Copy the square of nx from registers [edx:eax] to registers
             *----[edi:esi]
             */
            mov      edi, edx;
            mov      esi, eax;


            /**
             *----Compute the 64-bit square of ny (i.e., ebx * ebx). The high and low
             *----32 bits of the product are stored in edx and eax, respectively.
             */
            mov      eax, ebx;
            imul  eax;


            /**
             *----Compute the 64-bit sum of (nx * nx) (i.e., [edi:esi]) and (ny * ny)
             *----(i.e., [edx:eax]) and store the result into [edi:esi]. Note that
             *----the second instruction (i.e., adc) determines the processor flags
             *----for the high 32 bits of the 64-bit sum.
             */
            add      esi, eax;
            adc      edi, edx;


            /**
             *----The goal is to normalize the 64-bit 32.32 fixed point sum to a 0.64
             *----fixed point value that lies in the mathematical range [0.25, 1).
             *----This can be accomplished by choosing an integer s such that the sum
             *----multiplied by 2^s is at least 2^62. It is convenient to choose an
             *----even value for s because an integral value of (s / 2) is needed
             *----later. First, initialize ecx to 31.
             */
            mov      ecx, 31;


            /**
             *----If the high 32 bits of the 64-bit sum have a value of zero (i.e.,
             *----processor flag ZF == 1), then jump to sumHighClear
             */
            je      sumHighClear;


            /**
             *----At least one of the high 32 bits of the 64-bit sum is 1. Therefore,
             *----use only the high 32 bits (i.e., edi) to compute the number of
             *----leading zeroes. Store the number of leading zeroes in ecx.
             */
            bsr      eax, edi;
            sub      ecx, eax;


            /**
             *----Clear the LSB of the normalization shift s (i.e., ecx), thereby
             *----making s an even number. Store a copy of s in ebx.
             */
            and      ecx, 0xfffffffe;
            mov      ebx, ecx;


            /**
             *----Normalize the 64-bit 32.32 fixed point sum [sumHigh:sumLow] to a
             *----0.64 fixed point value that lies in the mathematical range [0.25,
             *----1). By storing the 64-bit result as a 32-bit integer, the low 32
             *----bits of the 0.64 fixed point value are truncated (i.e., effectively
             *----set to zero). Considered as an unsigned integer, lenSqr (i.e., edi)
             *----has its leading 1 in bit 31 or bit 30 (i.e., in one of the high two
             *----bits of the 32-bit integer) and thus is at least 2^30.
             */
            shl      edi, cl;
            mov      ecx, 32;
            sub      ecx, ebx;
            shr      esi, cl;


            /**
             *----Merge the two 32-bit integers into a single 32-bit integer which
             *----represents the high 32 bits of the 0.64 fixed point sum of the
             *----squares of nx and ny. After this operation, esi holds ((nx * nx) +
             *----(ny * ny)).
             */
            or      esi, edi;


            /**
             *----Jump to the RSQ step
             */
            jmp      rsqStep;


            /**
             *----The high 32 bits of the sum are zero. Therefore, use only the low
             *----32 bits to compute the number of leading zeroes of the sum. Store
             *----the number of leading zeroes in ecx.
             */
            sumHighClear:
            bsr      eax, esi;
            sub      ecx, eax;


            /**
             *----Clear the LSB of the normalization shift s (i.e., ecx), thereby
             *----making s an even number. Store (s + 32) in ebx (32 is added to the
             *----shift amount because the high 32 bits of the 64-bit sum are zero).
             */
            and      ecx, 0xfffffffe;
            mov      ebx, 32;
            add      ebx, ecx;


            /**
             *----Normalize the 64-bit 32.32 fixed point sum [sumHigh:sumLow] to a
             *----0.64 fixed point value that lies in the mathematical range [0.25,
             *----1). By storing the 64-bit result as a 32-bit integer, the low 32
             *----bits of the 0.64 fixed point value are truncated (i.e., effectively
             *----set to zero). Considered as an unsigned integer, lenSqr (i.e., edi)
             *----has its leading 1 in bit 31 or bit 30 (i.e., in one of the high two
             *----bits of the 32-bit integer) and thus is at least 2^30.
             */
            shl      esi, cl;


            /**
             *----Compute the reciprocal square root of the squared length of the
             *----vector by calling the RSQ() function. The result is the inverse
             *----length of the input vector and it will be stored in edi.
             */
            rsqStep:
            mov      eax, esi;
            push  eax;
            call  RSQ;
            mov      edi, eax;
            pop      eax;


            /**
             *----Unnormalize the vector components nx and ny and convert the results
             *----to 16.16 fixed point values by shifting nx and ny to the right by
             *----(46 - (s / 2)) bits. See the C implementation above for the
             *----derivation of this formula.
             */
            shr      ebx, 1;
            mov      ecx, 46;
            sub      ecx, ebx;
        

            /**
             *----Scale nx by the inverse length of the vector (i.e., edi). The high
             *----and low 32 bits of the 64-bit product are stored in edx and eax,
             *----respectively.
             */
            mov      eax, nx;
            imul  edi;


            /**
             *----Move the 64-bit normalized nx component from registers [edx:eax] to
             *----registers [ebx:esi].
             */
            mov      ebx, edx;
            mov      esi, eax;
        

            /**
             *----Scale ny by the inverse length of the vector (i.e., edi). The high
             *----and low 32 bits of the 64-bit product are stored in edx and eax,
             *----respectively.
             */
            mov      eax, ny;
            imul  edi;


            /**
             *----Registers [ebx:esi] contain the normalized 64-bit vector component
             *----nx. Registers [edx:eax] contain the normalized 64-bit vector
             *----component ny. Register ecx contains the normalization shift s.
             *----Compare the shift amount to 32. If it is less than 32 (i.e.,
             *----processor flag SF == 1), then jump to shiftLess32.
             */
            cmp      ecx, 32;
            js      shiftLess32;


            /**
             *----The shift amount is at least 32. Therefore, it is sufficient to
             *----operate only on the high 32 bits, because the low 32 bits are
             *----completely shifted off the right end. Use arithmetic shifts because
             *----nx and ny are signed values.
             */
            sub      ecx, 32;
            sar      ebx, cl;
            sar      edx, cl;


            /**
             *----Store the normalized vector components and jump to nonDegenerate
             */
            mov      nx, ebx;
            mov      ny, edx;
            jmp      nonDegenerate;


            /**
             *----The shift amount is less than 32. Therefore, it is necessary to
             *----merge the high 32 bits and the low 32 bits of the 64-bit result
             *----into a single 32-bit 16.16 fixed point value. Shift the low 32 bits
             *----of the normalized nx and ny components to the required bit
             *----positions.
             */
            shiftLess32:
            shr      esi, cl;
            shr      eax, cl;


            /**
             *----Set the shift s to (32 - s)
             */
            sub      ecx, 32;
            neg      ecx;


            /**
             *----Shift the high 32 bits of the normalized nx and ny components to
             *----the required bit positions
             */
            shl      ebx, cl;
            shl      edx, cl;


            /**
             *----Combine the high and low 32 bits of the normalized nx and ny
             *----components into the final 32-bit integers
             */
            or      ebx, esi;
            or      edx, eax;


            /**
             *----Store the normalized vector components and jump to nonDegenerate
             */
            mov      nx, ebx;
            mov      ny, edx;
            jmp      nonDegenerate;

        
            /**
             *----nx is non-zero and ny is zero. Therefore, the unit vector is either
             *----(+1, 0) or (-1, 0). Set the y component of the final unit vector to
             *----zero.
             */
            nyZero:
            mov      ny, 0;


            /**
             *----The processor flags currently reflect the state of ny, not nx.
             *----Check the processor flags for nx (i.e., edi) to determine if nx is
             *----positive or negative.
             */
            test  edi, edi;


            /**
             *----If nx (i.e., edi) is negative (i.e., processor flag SF == 1), then
             *----jump to leftUnitVector
             */
            js      leftUnitVector;


            /**
             *----nx is positive, so the unit vector is (+1, 0). Set the x component
             *----of the final unit vector to 1 and jump to nonDegenerate.
             */
            mov      nx, 0x00010000;
            jmp      nonDegenerate;


            /**
             *----nx is negative and ny is zero, so the unit vector is (-1, 0). Set
             *----the x component of the final unit vector to -1 and jump to
             *----nonDegenerate.
             */
            leftUnitVector:
            mov      nx, 0xffff0000;
            jmp      nonDegenerate;


            /**
             *----nx and ny are both zero (i.e., a degenerate input vector). Set the
             *----return value to zero to indicate that the length of the input
             *----vector is zero and jump to the end of this assembly block.
             */
            degenerate:
            mov      returnValue, 0;
            jmp      end;


            /**
             *----Set the return value to 1 to indicate that the length of the input
             *----vector is non-zero
             */
            nonDegenerate:
            mov      returnValue, 1;
        

            /**
             *----Exit
             */
            end:
        }


        /**
         *----Store the unit vector in (nxOut, nyOut) and return the computed return
         *----value
         */
        *nxOut = nx;
        *nyOut = ny;
        return(returnValue);
    }
#endif
}


/**
 *-----------------------------------------------------------------------------------
 *    Compute and return the positive square root of n. The input n is a non-negative
 *    32-bit I.F fixed point value (i.e., n has I integer bits and F fractional bits,
 *    where I + F = 32) and the computed result is a 32-bit I'.F' fixed point value
 *    (i.e., the computed result has I' integer bits and F' fractional bits, where I' +
 *    F' = 32).
 *
 *    The values of I, F, I', and F' are not passed directly as inputs to the SQRT()
 *    function. Instead, the caller must pass shift as an input to the SQRT() function,
 *    where shift is a non-negative integer such that
 *
 *        shift = 46 - F' + F/2
 *
 *    SQRT() can be called for a wide range of values of I, F, I', and F'. For example,
 *    if the input n is a 24.8 fixed point value (i.e., I = 24 and F = 8) and the
 *    desired output format is a 16.16 fixed point value (i.e., I' = F' = 16), then the
 *    calling function should compute shift as (46 - 16 + 8/2) = 30 + 8/2 = 30 + 4 =
 *    34.
 *
 *    If n is zero, SQRT() returns exactly zero.
 *
 *    All assembly and C implementations produce bit-identical results.
 *
 *    Implementation notes:
 *
 *      - SQRT() computes the square root of n as:
 *
 *            sqrt(n) = n^(1/2)
 *                    = n^(1 - 1/2)
 *                    = n^1 * n^(-1/2)
 *                    = n * n^(-1/2)
 *                    = n * rsqrt(n)
 *
 *        where rsqrt(n) denotes the reciprocal square root of n.
 *
 *      - SQRT() computes the reciprocal square root of n by first normalizing n to a
 *        0.32 fixed point value that lies in the mathematical range [0.25, 1) and then
 *        calling the RSQ() function. The RSQ() function computes the reciprocal square
 *        root as a 2.30 fixed point value (see the documentation for RSQ() above for
 *        more details).
 *
 *      - To maximize intermediate fractional precision, SQRT() postpones the
 *        unnormalization of the computed reciprocal square root until the final step.
 *        As explained in the above section Fixed Point Implementation Notes On
 *        Computing Reciprocal Square Roots, the unnormalization step requires shifting
 *        the computed reciprocal square root to the right by (16 - 0.5 * (F + s))
 *        bits, where F is the number of fractional bits in the input (i.e., n) and s
 *        is the normalization shift amount required to normalize n to a 0.32 fixed
 *        point value that lies in the mathematical range [0.25, 1).
 *
 *      - Note that n is an I.F fixed point value and that the computed reciprocal
 *        square root is a 2.30 fixed point value. Therefore, the product (n *
 *        rsqrt(n)) is a 64-bit {2+I}.{30+F} fixed point value. Converting this 64-bit
 *        product to a 32-bit I'.F' fixed point value requires shifting the 64-bit
 *        product to the right by (30 + F - F') bits.
 *
 *      - Therefore, the unnormalization step and the conversion from a {2+I}.{30+F}
 *        fixed point value to an I'.F' fixed point value can be combined into a single
 *        right shift of ((30 + F - F') + (16 - 0.5 * (F + s))) bits. This expression
 *        simplifies to (46 - F' + F/2 - s/2) bits.
 *
 *      - Note that the shift parameter is defined as (46 - F' + F/2). Consequently,
 *        the final step of the SQRT() function is to shift the 64-bit product (n *
 *        rsqrt(n)) to the right by (shift - s/2) bits.
 *
 *      - The documentation below uses a running example to help explain the fixed
 *        point implementation. This running example considers the case where the input
 *        value n is the ADF_I1616 fixed point value 0x3e80000, which has the
 *        mathematical value (0x3e80000 / 2^16) = 1000, and the desired output is an
 *        ADF_I1616 fixed point value. Therefore, I = F = I' = F' = 16, and the shift
 *        parameter is set to (46 - 16 + 16/2) = (30 + 8) = 38.
 *-----------------------------------------------------------------------------------
 */
static ADF_U32 SQRT (ADF_U32 n, ADF_I32 shift)
{
    ADF_U32 normShift;
    ADF_U32 normalized;
    ADF_U32 invRoot;
    ADF_U32 productHigh, productLow;
    ADF_U32 finalShift;


    /**
     *----The square root of zero is exactly zero, so there is nothing to compute
     */
    if (n == 0) return(0);


    /**
     *----Choose an even left shift that turns n into a 0.32 fixed point value in
     *----the mathematical range [0.25, 1). The shift is the number of leading
     *----zeroes of n with its LSB cleared; it is kept even because half of it is
     *----needed below. Running example (n is the 16.16 value 0x3e80000, i.e.,
     *----1000): n has 6 leading zeroes, so normShift = (6 & 0xfffffffe) = 6.
     */
    normShift = (CountLeadingZeroes(n) & 0xfffffffe);


    /**
     *----Apply the normalization shift. Running example: normalized =
     *----(0x3e80000 << 6) = 0xfa000000, which as a 0.32 fixed point value is
     *----(0xfa000000 / 2^32) = 0.9765625.
     */
    normalized = (n << normShift);


    /**
     *----Ask RSQ() for the reciprocal square root of the normalized input. RSQ()
     *----produces a 2.30 fixed point value. Running example: RSQ(0xfa000000) =
     *----0x40c3713a, i.e., (0x40c3713a / 2^30) = 1.01192885078489780426025390625.
     */
    invRoot = RSQ(normalized);


    /**
     *----Form the 64-bit product n * rsqrt(n), which equals sqrt(n) up to the
     *----pending unnormalization. Because n is I.F and invRoot is 2.30, the
     *----product is a {2+I}.{30+F} fixed point value split across productHigh
     *----(high 32 bits) and productLow (low 32 bits). Running example:
     *----(0x3e80000 * 0x40c3713a) = 0xfcfb724a900000, an 18.46 fixed point value
     *----with productHigh = 0x00fcfb72 and productLow = 0x4a900000.
     */
    UMUL64(n, invRoot, &productHigh, &productLow);


    /**
     *----Fold the two remaining right shifts into one. Unnormalizing the
     *----reciprocal square root costs (16 - 0.5 * (F + normShift)) bits and
     *----converting {2+I}.{30+F} to I'.F' costs (30 + F - F') bits, for a total
     *----of (46 - F' + F/2 - normShift/2) = (shift - normShift/2) bits. Running
     *----example: shift is 38 and normShift is 6, so finalShift = (38 - 3) = 35.
     */
    finalShift = (shift - (normShift >> 1));


    /**
     *----A shift of fewer than 32 bits leaves part of the low word in the
     *----result, so the surviving bits of both words are merged into the 32-bit
     *----I'.F' value
     */
    if (finalShift < 32) {
        return((productHigh << (32 - finalShift)) | (productLow >> finalShift));
    }


    /**
     *----Otherwise the low word is shifted out entirely and only the high word
     *----contributes. Running example: (0x00fcfb72 >> (35 - 32)) = 0x001f9f6e,
     *----i.e., (0x001f9f6e / 2^16) = 31.622772216796875, against a true square
     *----root of 1000 of approximately 31.62277660168379331998893544.
     */
    return(productHigh >> (finalShift - 32));
}


/**
 *-----------------------------------------------------------------------------------
 *    Compute and return the positive square root of n, where both n and the computed
 *    result are non-negative ADF_I1616 fixed point values.
 *
 *    Special cases are handled as follows. If n is zero, I1616_SQRT() returns exactly
 *    zero. If n is negative, the result is undefined.
 *
 *    All assembly and C implementations produce bit-identical results.
 *
 *    Implementation notes:
 *
 *      - I1616_SQRT() computes the square root of n as:
 *
 *            sqrt(n) = n^(1/2)
 *                    = n^(1 - 1/2)
 *                    = n^1 * n^(-1/2)
 *                    = n * n^(-1/2)
 *                    = n * rsqrt(n)
 *
 *        where rsqrt(n) denotes the reciprocal square root of n.
 *
 *      - Although this approach can be implemented directly using the expression
 *        I1616_MUL(n, I1616_RSQ(n)), the computed result is highly inaccurate because
 *        14 intermediate fractional bits are lost when I1616_RSQ() converts the
 *        reciprocal square root from its internal 2.30 fixed point representation to a
 *        16.16 fixed point value (see above for more information about the I1616_RSQ()
 *        function). Consequently, the computed product of n and I1616_RSQ(n) severely
 *        underestimates the true product. This underestimate is significant when n is
 *        large, because the reciprocal square root of a large number is a small number
 *        (i.e., a value whose binary representation contains 1 bits in only the low
 *        fractional bits).
 *
 *      - To avoid this problem, I1616_SQRT() uses the high-precision SQRT() function
 *        (see above) with the shift argument set to 38. Following the notation used in
 *        the documentation for the SQRT() function, the input is a 16.16 fixed point
 *        value (i.e., I = F = 16) and the output is also a 16.16 fixed point value
 *        (i.e., I' = F' = 16). Therefore shift = 46 - F' + F/2 = 46 - 16 + 16/2 = 30 +
 *        8 = 38.
 *-----------------------------------------------------------------------------------
 */
ADF_I1616 I1616_SQRT (ADF_I1616 n)
{
    ADF_U32 root;


    /**
     *----Both the input and the output are 16.16 fixed point values (F = F' =
     *----16), so the shift argument expected by SQRT() is (46 - F' + F/2) =
     *----(46 - 16 + 16/2) = 38
     */
    root = SQRT((ADF_U32) n, 38);


    /**
     *----Hand the unsigned result back as an ADF_I1616 value
     */
    return((ADF_I1616) root);
}


/**
 *-----------------------------------------------------------------------------------
 *    Compute and return the positive square root of n, where n is a non-negative
 *    ADF_I2408 fixed point value and the computed result is an ADF_I1616 fixed point
 *    value.
 *
 *    Special cases are handled as follows. If n is zero, I2408_SQRT_I1616() returns
 *    exactly zero. If n is negative, the result is undefined.
 *
 *    All assembly and C implementations produce bit-identical results.
 *
 *    Implementation notes:
 *
 *      - I2408_SQRT_I1616() computes the square root of n using the same approach used
 *        by the I1616_SQRT() function (see above). In this case, the shift argument
 *        (i.e., the second argument to the SQRT() function) is set to 34 instead of
 *        38. Following the notation used in the documentation for the SQRT() function
 *        (see above), the input is a 24.8 fixed point value (i.e., I = 24 and F = 8)
 *        and the output is a 16.16 fixed point value (i.e., I' = F' = 16). Therefore
 *        shift = 46 - F' + F/2 = 46 - 16 + 8/2 = 30 + 4 = 34.
 *-----------------------------------------------------------------------------------
 */
ADF_I1616 I2408_SQRT_I1616 (ADF_I2408 n)
{
    ADF_U32 root;


    /**
     *----The input is a 24.8 fixed point value (F = 8) and the output is a 16.16
     *----fixed point value (F' = 16), so the shift argument expected by SQRT()
     *----is (46 - F' + F/2) = (46 - 16 + 8/2) = 34
     */
    root = SQRT((ADF_U32) n, 34);


    /**
     *----Hand the unsigned result back as an ADF_I1616 value
     */
    return((ADF_I1616) root);
}

#endif /* FS_EDGE_RENDER */

/**
 *-----------------------------------------------------------------------------------
 *    Compute and return the unsigned quotient of n and d (i.e., n/d). The numerator n
 *    is a 0.32 fixed point value that lies in the mathematical range [0.25, 0.5) and
 *    the denominator d is a 0.32 fixed point value that lies in the mathematical range
 *    [0.5, 1). Therefore 0.25 <= n < 0.5 <= d < 1. Considered as 32-bit unsigned
 *    integers, n must lie in the range [2^30, 2^31) and d must be at least 2^31. The
 *    computed result is a 1.31 fixed point value whose leading 1 bit is at bit
 *    position 30.
 *
 *    All assembly and C implementations produce bit-identical results.
 *
 *    Implementation notes:
 *
 *      - The quotient of n and d is computed in two steps:
 *
 *          1. Compute the reciprocal of d (i.e., 1/d). The reciprocal estimation
 *             method consists of a table lookup followed by two iterations of
 *             Newton-Raphson. The Newton-Raphson iteration equation to compute a
 *             reciprocal is x{i+1} = x{i} * (2 - d * x{i}), where x{i} is the estimate
 *             of the solution in the current iteration (i.e., iteration i) and x{i+1}
 *             is the computed estimate in the next iteration (i.e., iteration (i +
 *             1)). See the above section Newton-Raphson Method Overview for an
 *             overview of the theory behind this numerical technique and for the
 *             derivation of the above formula.
 *
 *             A table lookup on the leading bits of the input d is used to obtain an
 *             initial approximation to the reciprocal of d. See the documentation for
 *             divTable[] below for more details about the table.
 *
 *          2. Multiply the reciprocal of d (i.e., 1/d) by n.
 *
 *      - The documentation for the C implementation below uses a running example to
 *        help explain the fixed point implementation. This running example considers
 *        the case where the input value n is the 0.32 fixed point value 0x40000000,
 *        which has the mathematical value (0x40000000 / 2^32) = 0.25, and the input
 *        value d is the 0.32 fixed point value 0x80000000, which has the mathematical
 *        value (0x80000000 / 2^32) = 0.5. The true quotient (n / d) is 0.5.
 *
 *    Performance notes:
 *
 *    Intel Centrino Core Duo T2500 (2 MB L2, 2.0 GHz, FSB 667 MHz), MSVC 6 compiler,
 *    Release mode:
 *      - ADF_MATH_FIXED_C_64       is ~1.3x as fast as ADF_MATH_FIXED_C_32
 *      - ADF_MATH_FIXED_ASM_X86 is ~1.6x as fast as ADF_MATH_FIXED_C_64
 *      - ADF_MATH_FIXED_ASM_X86 is ~2.0x as fast as ADF_MATH_FIXED_C_32
 *-----------------------------------------------------------------------------------
 *-----------------------------------------------------------------------------------
 *    divTable[] is the reciprocal lookup table used by the DIV() function. The table
 *    contains 128 elements and is precomputed as follows:
 *
 *    divTable[i] = round(65536.0 / (i + 128.5)) - 256, where i is an integer that lies
 *    in the range [0, 127]. The following discussion explains this formula.
 *
 *    The purpose of divTable[] is to provide a fast and reasonably accurate initial
 *    estimate to the reciprocal of a number d.
 *
 *    Assume that d is a 0.32 fixed point value that lies in the mathematical range
 *    (0.5, 1). Considered as an unsigned integer, d is greater than 2^31. Therefore,
 *    the MSB of d is 1.
 *
 *    The reciprocal of each value in the range (0.5, 1) must lie in the range (1, 2).
 *    Therefore, the reciprocal of each value in the range (0.5, 1) can be expressed as
 *    a 24.8 fixed point value, where the integer portion of the 24.8 fixed point value
 *    is always 1. For example, the reciprocal of 0.7 can be expressed as the 24.8
 *    fixed point value 0x16e, which has the mathematical value (0x16e / 2^8) =
 *    1.4296875. Consequently, the table only needs to store the low 8 bits (i.e., the
 *    fractional bits) of the reciprocal of each value in the range (0.5, 1). Storing
 *    the integer portion of the 24.8 fixed point value in the table is unnecessary
 *    because it is always 1.
 *
 *    For a good tradeoff between table size and the accuracy of the initial estimate,
 *    this implementation uses the seven fractional bits of d following the leading 1
 *    to index into the table, thereby limiting the table size to 128 elements. Note
 *    that the MSB of d is not used to index into the table because it is always 1.
 *
 *    Computing the table is accomplished with the following steps:
 *
 *    Let i be the 7-bit integer index formed by the seven bits following the leading 1
 *    of d (i.e., bits 30:24 of d).
 *
 *    First, map an integer index i in the range [0, 127] to the range [0.501953125,
 *    0.998046875], which is an approximation to the range (0.5, 1):
 *
 *      (i + 128.5) / 256.0
 *
 *    The value of 128.5 (instead of 128.0) is chosen for technical reasons explained
 *    below. Next, compute the reciprocal:
 *
 *      256.0 / (i + 128.5)
 *
 *    Scale the result by 256.0 and round to the nearest integer to obtain a 24.8 fixed
 *    point value:
 *
 *      round(65536.0 / (i + 128.5))
 *
 *    Finally, subtract (i.e., remove) the integer portion of the 24.8 fixed point
 *    value because the integer portion is always 1. Putting all the steps together
 *    yields the following formula:
 *
 *      divTable[i] = round(65536.0 / (i + 128.5)) - 256
 *
 *    The reason for choosing 128.5 instead of 128.0 is to handle the case where the
 *    index i is zero. Suppose the value of 128.0 is used instead. Then
 *
 *      divTable[0] = round(65536.0 / (0 + 128.0)) - 256
 *                  = round(65536.0 / (128.0)) - 256
 *                  = round(512.0) - 256
 *                  = 512 - 256
 *                  = 256
 *
 *    The value 256 requires 9 bits to store. By choosing a value slightly larger than
 *    128.0, such as 128.5, the following result is obtained:
 *
 *      divTable[0] = round(65536.0 / (0 + 128.5)) - 256
 *                  = round(65536.0 / (128.5)) - 256
 *                  = round(510.00778210116731517509727626459) - 256
 *                  = 510 - 256
 *                  = 254
 *
 *    The value 254 requires only 8 bits to store. Therefore the entire table can be
 *    stored in just 128 bytes.
 *
 *    Although the above discussion has assumed that d lies in the mathematical range
 *    (0.5, 1), the DIV() function uses divTable[] to compute reciprocals of values
 *    that lie in the mathematical range [0.5, 1) (note the inclusion of 0.5). The
 *    first element of divTable[] (i.e., divTable[0]) is an estimate of the reciprocal
 *    of 0.5.
 *-----------------------------------------------------------------------------------
 */
static FS_CONST ADF_U8 divTable[] = {
    0xfe, 0xfa, 0xf6, 0xf2, 0xef, 0xeb, 0xe7, 0xe4, /* i =   0 ..   7 */
    0xe0, 0xdd, 0xd9, 0xd6, 0xd2, 0xcf, 0xcc, 0xc9, /* i =   8 ..  15 */
    0xc6, 0xc2, 0xbf, 0xbc, 0xb9, 0xb6, 0xb3, 0xb1, /* i =  16 ..  23 */
    0xae, 0xab, 0xa8, 0xa5, 0xa3, 0xa0, 0x9d, 0x9b, /* i =  24 ..  31 */
    0x98, 0x96, 0x93, 0x91, 0x8e, 0x8c, 0x8a, 0x87, /* i =  32 ..  39 */
    0x85, 0x83, 0x80, 0x7e, 0x7c, 0x7a, 0x78, 0x75, /* i =  40 ..  47 */
    0x73, 0x71, 0x6f, 0x6d, 0x6b, 0x69, 0x67, 0x65, /* i =  48 ..  55 */
    0x63, 0x61, 0x5f, 0x5e, 0x5c, 0x5a, 0x58, 0x56, /* i =  56 ..  63 */
    0x54, 0x53, 0x51, 0x4f, 0x4e, 0x4c, 0x4a, 0x49, /* i =  64 ..  71 */
    0x47, 0x45, 0x44, 0x42, 0x40, 0x3f, 0x3d, 0x3c, /* i =  72 ..  79 */
    0x3a, 0x39, 0x37, 0x36, 0x34, 0x33, 0x32, 0x30, /* i =  80 ..  87 */
    0x2f, 0x2d, 0x2c, 0x2b, 0x29, 0x28, 0x27, 0x25, /* i =  88 ..  95 */
    0x24, 0x23, 0x21, 0x20, 0x1f, 0x1e, 0x1c, 0x1b, /* i =  96 .. 103 */
    0x1a, 0x19, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, /* i = 104 .. 111 */
    0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, /* i = 112 .. 119 */
    0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, /* i = 120 .. 127 */
};
/**
 *-----------------------------------------------------------------------------------
 *    Compute and return an approximation to the unsigned quotient (n / d) of two
 *    normalized 0.32 fixed point values. The result is a 1.31 fixed point value.
 *
 *    Input requirements (as relied upon by the comments and code below):
 *
 *      - d must have its MSB set (i.e., d lies in the mathematical range [0.5, 1)).
 *        The table index ((d >> 24) - 128) is a valid index into divTable[] (i.e.,
 *        an integer in the range [0, 127]) only under this condition.
 *
 *      - n is expected to have its leading 1 at bit position 30 (i.e., n lies in
 *        the mathematical range [0.25, 0.5)).
 *
 *    The reciprocal 1/d is estimated by a lookup into divTable[] and refined by two
 *    Newton-Raphson iterations of the form x' = (2 * x) - (d * x * x). The refined
 *    reciprocal is then multiplied by n. The running example in the comments below
 *    shows that the result can be slightly smaller than the true quotient (e.g.,
 *    0x3fffffff instead of 0x40000000 for 0.25 / 0.5).
 *
 *    NOTE(review): if ADF_MATH_MODE selects neither the C branch nor the x86
 *    assembly branch below, the function body is empty and no value is returned.
 *-----------------------------------------------------------------------------------
 */
static ADF_U32 DIV (ADF_U32 n, ADF_U32 d)
{
#if ((ADF_MATH_MODE == ADF_MATH_FIXED_C_32) || \
    (ADF_MATH_MODE == ADF_MATH_FIXED_C_64))
    {
        /**
         *----C implementation (32-bit and 64-bit). x0 is the initial estimate
         *----obtained by a table lookup. x1 is the estimate obtained after the first
         *----Newton-Raphson iteration. x2 is the estimate obtained after the second
         *----Newton-Raphson iteration. x2Low and x2High are used for temporary
         *----storage.
         */
        ADF_U32 x0, x1, x2;
        ADF_U32 x2Low, x2High;


        /**
         *----Note that the MSB of d is 1. Perform a table lookup using the seven
         *----bits of d following the MSB (i.e., bits 30:24) to obtain the 8
         *----fractional bits of a 24.8 fixed point estimate to 1/d. Add the integer
         *----portion of the 24.8 fixed point estimate (i.e., 0x100), which always
         *----has the mathematical value 1. The result is a 24.8 fixed point estimate
         *----to 1/d that lies in the mathematical range (1, 2). Running example:
         *----consider the case where d = 0x80000000 (i.e., the mathematical value
         *----(0x80000000 / 2^32) = 0.5). x0 = divTable[(d >> 24) - 128] + 0x100 =
         *----divTable[(0x80000000 >> 24) - 128] + 0x100 = divTable[128 - 128] +
         *----0x100 = divTable[0] + 0x100 = 0xfe + 0x100 = 0x1fe, which has the
         *----mathematical value (0x1fe / 2^8) = 1.9921875. This is the initial
         *----estimate of 1/0.5.
         */
        x0 = divTable[(d >> 24) - 128] + 0x100;


        /**
         *----Begin the first Newton-Raphson iteration. Compute the 16.16 fixed point
         *----value (x0 * x0). Note that x0 is a 24.8 fixed point value that lies in
         *----the mathematical range (1, 2), so the product (x0 * x0) is a 64-bit
         *----48.16 fixed point value whose high 32 bits are all zero. Therefore, it
         *----suffices to keep only the low 32 bits of the 64-bit product,
         *----effectively converting the 48.16 fixed point value to a 16.16 fixed
         *----point value. Running example: x1 = x0 * x0 = 0x1fe * 0x1fe = 0x3f804,
         *----which has the mathematical value (0x3f804 / 2^16) = 3.96881103515625.
         */
        x1 = x0 * x0;


        /**
         *----Compute the 16.16 fixed point value (d * x0 * x0). Note that d is a
         *----0.32 fixed point value and x1 is a 16.16 fixed point value, so the
         *----product of d and x1 is a 64-bit 16.48 fixed point value. Keeping only
         *----the high 32 bits of the 64-bit product effectively truncates the 16.48
         *----fixed point value to a 16.16 fixed point value. Running example: d * x1
         *----= 0x80000000 * 0x3f804 = 0x1fc0200000000. Keeping only the high 32 bits
         *----yields x1 = 0x1fc02, which has the mathematical value (0x1fc02 / 2^16)
         *----= 1.984405517578125.
         */
        x1 = UMUL64_HIGH32(d, x1);


        /**
         *----Convert x0 from a 24.8 fixed point value to a 16.16 fixed point value
         *----(i.e., shift x0 to the left by 8 bits) and multiply x0 by two (i.e.,
         *----shift x0 to the left by an additional bit). Then compute the 16.16
         *----fixed point value ((2 * x0) - (d * x0 * x0)). This is the end of the
         *----first Newton-Raphson iteration. Running example: x1 = (x0 << 9) - x1 =
         *----(0x1fe << 9) - 0x1fc02 = 0x3fc00 - 0x1fc02 = 0x1fffe, which has
         *----the mathematical value (0x1fffe / 2^16) = 1.999969482421875. Note that
         *----x1 is a better estimate of 1/0.5 than x0.
         */
        x1 = (x0 << 9) - x1;


        /**
         *----Begin the second Newton-Raphson iteration. Compute the 64-bit 32.32
         *----fixed point value (x1 * x1). Set x2High to the high 32 bits of the
         *----64-bit product and set x2Low to the low 32 bits of the 64-bit product.
         *----Note that x1 is a 16.16 fixed point estimate to 1/d that lies in the
         *----mathematical range (1, 2). Therefore, the high 30 bits of x2High are
         *----zero but at least one of the low two bits of x2High is 1. Running
         *----example: x1 * x1 = 0x3fff80004, which has the mathematical value
         *----(0x3fff80004 / 2^32) = 3.999877930618822574615478515625. x2High = 0x3
         *----and x2Low = 0xfff80004. Note that the low two bits of x2High are both
         *----1.
         */
        UMUL64(x1, x1, &x2High, &x2Low);

        
        /**
         *----Convert the 32.32 fixed point value (x1 * x1) to a 33.31 fixed point
         *----value by shifting (x1 * x1) to the right by 1 bit and rounding the
         *----result (the bit shifted out of x2Low is added back in). Note that
         *----because the variable x2 can only store 32 bits, bit 1 of x2High is not
         *----stored in x2. The potential error due to this lost bit is corrected in
         *----a separate step below. Running example: x2 = (x2Low >> 1) + (x2High <<
         *----31) + (x2Low & 1) = (0xfff80004 >> 1) + (0x3 << 31) + (0xfff80004 & 1)
         *----= (0x7ffc0002) + (0x180000000) + (0) = 0x1fffc0002. Note that this
         *----value is stored as 0xfffc0002 (i.e., the high bit is lost).
         */
        x2 = (x2Low >> 1) + (x2High << 31) + (x2Low & 1);


        /**
         *----Compute the 1.31 fixed point value (d * x1 * x1). Note that d is a 0.32
         *----fixed point value and x2 is a 1.31 fixed point value, so the product of
         *----d and x2 is a 64-bit 1.63 fixed point value. Keeping only the high 32
         *----bits of the 64-bit product effectively truncates the result to a 1.31
         *----fixed point value. Running example: d * x2 = 0x80000000 * 0xfffc0002 =
         *----0x7ffe000100000000. Keeping only the high 32 bits yields x2 =
         *----0x7ffe0001, which has the mathematical value (0x7ffe0001 / 2^31) =
         *----0.9999389653094112873077392578125.
         */
        x2 = UMUL64_HIGH32(d, x2);


        /**
         *----Recall that bit 1 of x2High was lost when converting the 32.32 fixed
         *----point value (x1 * x1) to a 33.31 fixed point value and storing the
         *----result into a 32-bit integer (see above). If bit 1 of x2High is 1, then
         *----correct for this error by adding the 1.31 fixed point value (2 * d) to
         *----the 1.31 fixed point value x2. Since d is a 0.32 fixed point value, its
         *----bit pattern reinterpreted as a 1.31 fixed point value is already the
         *----desired value (2 * d). Therefore, simply add d to x2. Running example:
         *----x2High is 0x3, so bit 1 of x2High is 1. Therefore, x2 is set to x2 + d
         *----= 0x7ffe0001 + 0x80000000 = 0xfffe0001, which has the mathematical
         *----value (0xfffe0001 / 2^31) = 1.9999389653094112873077392578125.
         */
        if (x2High & 2) x2 += d;


        /**
         *----Compute the 1.31 fixed point value ((2 * x1) - (d * x1 * x1)). Note
         *----that shifting x1 to the left by 15 bits converts x1 from a 16.16 fixed
         *----point value to a 1.31 fixed point value. Shifting x1 to the left by an
         *----additional bit effectively multiplies x1 by 2. The bit shifted out of
         *----the top of x1 is discarded; the subtraction is performed modulo 2^32.
         *----This is the end of the second Newton-Raphson iteration. Running
         *----example: x2 = (x1 << 16) - x2 = (0x1fffe << 16) - 0xfffe0001 =
         *----0x1fffe0000 - 0xfffe0001 = 0xffffffff, which has the mathematical value
         *----(0xffffffff / 2^31) = 1.9999999995343387126922607421875. Note that x2
         *----is a better estimate of 1/0.5 than x1. The true value of 1/0.5 is 2.
         */
        x2 = (x1 << 16) - x2;


        /**
         *----Compute and return the 1.31 fixed point value n/d. Evaluate n/d by
         *----multiplying 1/d (i.e., x2) by n. Note that x2 is a 1.31 fixed point
         *----value and n is a 0.32 fixed point value, so their product is a 64-bit
         *----1.63 fixed point value. Keeping only the high 32 bits of the 64-bit
         *----product effectively truncates the result to a 1.31 fixed point value.
         *----Since the leading 1 of x2 lies at bit position 31 (i.e., the MSB) and
         *----the leading 1 of n lies at bit position 30, the leading 1 of the
         *----resulting 1.31 fixed point value n/d lies at bit position 30. Running
         *----example: consider the case where n is the 0.32 fixed point value
         *----0x40000000, which has the mathematical value (0x40000000 / 2^32) =
         *----0.25. Then n * x2 = 0x40000000 * 0xffffffff = 0x3fffffffc0000000.
         *----Keeping only the high 32 bits yields the result 0x3fffffff, which has
         *----the mathematical value (0x3fffffff / 2^31) =
         *----0.4999999995343387126922607421875. Note that the true value of n/d in
         *----this example is 0.25/0.5 = 0.5.
         */
        return(UMUL64_HIGH32(n, x2));
    }
#elif (ADF_MATH_MODE == ADF_MATH_FIXED_ASM_X86)
    {
        /**
         *----x86 assembly implementation. It mirrors the C implementation above
         *----step by step. Register usage: ecx holds d, ebx holds x0 and later x1
         *----and x2, edi holds the table index and later the saved high word of
         *----(x1 * x1), and eax/edx hold the multiplication operands and products.
         */
        __asm {


            /**
             *----Set ecx and edi to the denominator d
             */
            mov      ecx, d;
            mov      edi, d;


            /**
             *----Note that the MSB of d is 1. Perform a table lookup using the seven
             *----bits of d following the MSB (i.e., bits 30:24) to obtain the 8
             *----fractional bits of a 24.8 fixed point estimate to 1/d. Add the
             *----integer portion of the 24.8 fixed point estimate (i.e., 0x100),
             *----which always has the mathematical value 1. The result is a 24.8
             *----fixed point estimate to 1/d that lies in the mathematical range (1,
             *----2). eax is cleared first because the table entry is loaded into
             *----its low byte (al) only.
             */
            shr      edi, 24;
            sub      edi, 128;
            mov      eax, 0;
            mov      al,  divTable[edi];
            add      eax, 256;


            /**
             *----Let x0 be the 24.8 fixed point estimate to 1/d (i.e., eax). Copy x0
             *----to ebx.
             */
            mov      ebx, eax;


            /**
             *----Begin the first Newton-Raphson iteration. Compute the 16.16 fixed
             *----point value eax = (x0 * x0). Note that x0 is a 24.8 fixed point
             *----value that lies in the mathematical range (1, 2), so the product
             *----(x0 * x0) is a 64-bit 48.16 fixed point value whose high 32 bits
             *----are all zero. Therefore, it suffices to keep only the low 32 bits
             *----of the 64-bit product (which is what the two-operand form of imul
             *----leaves in eax), effectively converting the 48.16 fixed point value
             *----to a 16.16 fixed point value.
             */
            imul  eax, eax;

        
            /**
             *----Compute the 16.16 fixed point value edx = (d * x0 * x0). Note that
             *----d (i.e., ecx) is a 0.32 fixed point value and (x0 * x0) (i.e., eax)
             *----is a 16.16 fixed point value, so their product [edx:eax] is a
             *----64-bit 16.48 fixed point value. Keeping only the high 32 bits of
             *----the 64-bit product (i.e., edx) effectively truncates the 16.48
             *----fixed point value to a 16.16 fixed point value.
             */
            mul      ecx;

        
            /**
             *----Convert x0 from a 24.8 fixed point value to a 16.16 fixed point
             *----value (i.e., shift x0 to the left by 8 bits) and multiply x0 by two
             *----(i.e., shift x0 to the left by an additional bit). Then compute the
             *----16.16 fixed point value ((2 * x0) - (d * x0 * x0)). This is the end
             *----of the first Newton-Raphson iteration. ebx now holds x1.
             */
            shl      ebx, 9;
            sub      ebx, edx;


            /**
             *----Begin the second Newton-Raphson iteration. Compute the 64-bit 32.32
             *----fixed point value [edx:eax] = (x1 * x1). Set edx to the high 32
             *----bits of the 64-bit product and set eax to the low 32 bits of the
             *----64-bit product. Note that x1 is a 16.16 fixed point estimate to 1/d
             *----that lies in the mathematical range (1, 2). Therefore, the high 30
             *----bits of edx are zero but at least one of the low two bits of edx is
             *----1.
             */
            mov      eax, ebx;
            mul      eax;

        
            /**
             *----Convert the 32.32 fixed point value (x1 * x1) to a 33.31 fixed
             *----point value by shifting (x1 * x1) to the right by 1 bit and
             *----rounding the result. The bit shifted out of eax by shr lands in the
             *----carry flag and is added back in by adc, which performs the
             *----rounding. Note that because the register eax can only store 32
             *----bits, bit 1 of edx is not stored in eax. The high word is therefore
             *----saved in edi first, and the potential error due to this lost bit
             *----is corrected in a separate step below.
             */
            mov      edi, edx;
            shl      edx, 31;
            shr      eax, 1; 
            adc      eax, edx;

        
            /**
             *----Compute the 1.31 fixed point value edx = (d * x1 * x1). Note that d
             *----is a 0.32 fixed point value and x2 (i.e., eax) is a 1.31 fixed
             *----point value, so the product of d and x2 is a 64-bit 1.63 fixed
             *----point value. Keeping only the high 32 bits of the 64-bit product
             *----effectively truncates the result to a 1.31 fixed point value.
             */
            mul      ecx;

        
            /**
             *----Recall that bit 1 of the high word of (x1 * x1) was lost when
             *----converting the 32.32 fixed point value (x1 * x1) to a 33.31 fixed
             *----point value and storing the result into a 32-bit register (see
             *----above). That high word was saved in edi. If bit 1 of edi is 1,
             *----then correct for this error by adding the 1.31 fixed point value (2
             *----* d) to the 1.31 fixed point value x2. Since d is a 0.32 fixed
             *----point value, its bit pattern reinterpreted as a 1.31 fixed point
             *----value is already the desired value (2 * d). Therefore, simply add d
             *----to x2. The test is performed by shifting bit 1 of edi into bit
             *----position 31 (the sign bit): if the sign flag is clear afterwards,
             *----bit 1 was 0 and the correction is skipped.
             */
            shl      edi, 30;
            jns      end;
            add      edx, ecx;


            /**
             *----Compute the 1.31 fixed point value ((2 * x1) - (d * x1 * x1)). Note
             *----that shifting x1 to the left by 15 bits converts x1 from a 16.16
             *----fixed point value to a 1.31 fixed point value. Shifting x1 to the
             *----left by an additional bit effectively multiplies x1 by 2. This is
             *----the end of the second Newton-Raphson iteration. ebx now holds x2.
             */
            end:
            shl      ebx, 16;
            sub      ebx, edx;


            /**
             *----Compute and return the 1.31 fixed point value n/d. Evaluate n/d by
             *----multiplying 1/d (i.e., x2) by n. Note that x2 is a 1.31 fixed point
             *----value and n is a 0.32 fixed point value, so their product is a
             *----64-bit 1.63 fixed point value. Keeping only the high 32 bits of the
             *----64-bit product effectively truncates the result to a 1.31 fixed
             *----point value. Since the leading 1 of x2 lies at bit position 31
             *----(i.e., the MSB) and the leading 1 of n lies at bit position 30, the
             *----leading 1 of the resulting 1.31 fixed point value n/d lies at bit
             *----position 30. 
             */
            mov      eax, n;
            mul      ebx;

        
            /**
             *----Return the quotient (i.e., the high 32 bits of the product, edx) by
             *----copying it to eax. NOTE(review): this branch contains no C return
             *----statement; it presumably relies on the compiler treating the value
             *----left in eax by the __asm block as the function's return value --
             *----confirm against the compiler documentation.
             */
            mov      eax, edx;
        }
    }
#endif
}


/**
 *-----------------------------------------------------------------------------------
 *    Compute and return the signed quotient (n / d). The input numerator n, the input
 *    denominator d, and the computed quotient are ADF_I1616 fixed point values. The
 *    quotient is rounded towards zero.
 *
 *    On output, I1616_DIV() sets status to ADF_FIXED_MATH_NO_ERROR,
 *    ADF_FIXED_MATH_OVERFLOW, ADF_FIXED_MATH_UNDERFLOW, or ADF_FIXED_MATH_NAN,
 *    depending on the outcome of the quotient computation. The possible cases are as
 *    follows:
 *
 *      1. If n is any value and d is zero, I1616_DIV() returns zero and sets status to
 *         ADF_FIXED_MATH_NAN.
 *
 *      2. If n is zero and d is non-zero, I1616_DIV() returns zero and sets status to
 *         ADF_FIXED_MATH_NO_ERROR.
 *
 *      3. If n is non-zero and d is 0x10000 (i.e., the mathematical value 1),
 *         I1616_DIV() returns n and sets status to ADF_FIXED_MATH_NO_ERROR.
 *
 *      4. If n is non-zero and d is 0xffff0000 (i.e., the mathematical value -1),
 *         I1616_DIV() returns -n and sets status to ADF_FIXED_MATH_NO_ERROR. The
 *         single exception is n = 0x80000000 (i.e., the mathematical value -32768),
 *         whose negation is not representable; this is an overflow (see case 5).
 *
 *      5. If n and d are both non-zero and the quotient (n / d) overflows the
 *         ADF_I1616 fixed point representation (including the case where the
 *         magnitude of the quotient would spill into the sign bit), I1616_DIV()
 *         returns zero and sets status to ADF_FIXED_MATH_OVERFLOW.
 *
 *      6. If n and d are both non-zero and the quotient (n / d) underflows the
 *         ADF_I1616 fixed point representation, I1616_DIV() returns zero and sets
 *         status to ADF_FIXED_MATH_UNDERFLOW.
 *
 *      7. In all other cases, I1616_DIV() computes and returns the signed fixed point
 *         quotient (n / d) and sets status to ADF_FIXED_MATH_NO_ERROR.
 *    
 *    All assembly and C implementations produce bit-identical results.
 *
 *    Implementation notes:
 *
 *      - I1616_DIV() computes the quotient of n and d using the following steps:
 *
 *          1. Normalize abs(n) to a 0.32 fixed point value that lies in the
 *             mathematical range [0.25, 0.5). Normalize abs(d) to a 0.32 fixed point
 *             value that lies in the mathematical range [0.5, 1).
 *
 *          2. Compute the unsigned quotient of the normalized values of abs(n) and
 *             abs(d) by calling the DIV() function (see above).
 *
 *          3. Unnormalize the unsigned quotient and convert the result to a signed
 *             16.16 fixed point value.
 *
 *      - abs(n) and abs(d) are computed with unsigned arithmetic so that the input
 *        0x80000000 (whose signed negation is not representable) is handled without
 *        signed overflow.
 *
 *    Performance notes:
 *
 *    Intel Centrino Core Duo T2500 (2 MB L2, 2.0 GHz, FSB 677 MHz): MSVC 6 compiler,
 *    Release mode:
 *      - ADF_MATH_FIXED_C_64       is ~1.2x as fast as ADF_MATH_FIXED_C_32
 *      - ADF_MATH_FIXED_ASM_X86 is ~1.4x as fast as ADF_MATH_FIXED_C_64
 *      - ADF_MATH_FIXED_ASM_X86 is ~1.7x as fast as ADF_MATH_FIXED_C_32
 *-----------------------------------------------------------------------------------
 */
ADF_I1616 I1616_DIV (ADF_I1616 n, ADF_I1616 d, ADF_I32 *status)
{
    ADF_I32 s;
    ADF_U32 q;
    ADF_U32 qMax;
    ADF_U32 nBits, dBits;
    ADF_I32 dShift, nShift;
    ADF_I32 qSign;


    /**
     *----Initialize status to FIXED_MATH_NO_ERROR (i.e., no error has occurred)
     */
    *status = ADF_FIXED_MATH_NO_ERROR;


    /**
     *----If the denominator d is zero, set status to ADF_FIXED_MATH_NAN and return
     *----zero
     */
    if (!d) {
        *status = ADF_FIXED_MATH_NAN;
        return(0);
    }


    /**
     *----If the numerator is zero, return zero
     */
    if (!n) return(0);


    /**
     *----If d is 0x10000 (i.e., the mathematical value 1), return n
     */
    if (d == 0x10000) return(n);


    /**
     *----If d is 0xffff0000 (i.e., the mathematical value -1), return -n. Note that
     *----the bit pattern 0xffffffff is NOT the mathematical value -1 in the 16.16
     *----fixed point representation; it is the mathematical value -1/65536. The
     *----only numerator whose negation is not representable is 0x80000000 (i.e.,
     *----the mathematical value -32768); in that case the quotient overflows.
     */
    if (d == -0x10000) {
        if ((ADF_U32) n == 0x80000000U) {
            *status = ADF_FIXED_MATH_OVERFLOW;
            return(0);
        }
        return(-n);
    }


    /**
     *----Compute the sign of the quotient: the quotient is negative if and only if
     *----exactly one of n and d is negative
     */
    qSign = ((n < 0) != (d < 0));


    /**
     *----Set nBits to abs(n) and set dBits to abs(d). The negation is performed on
     *----the unsigned bit pattern so that the input 0x80000000 yields the magnitude
     *----0x80000000 without signed overflow.
     */
    nBits = (n < 0) ? (ADF_U32) (0U - (ADF_U32) n) : (ADF_U32) n;
    dBits = (d < 0) ? (ADF_U32) (0U - (ADF_U32) d) : (ADF_U32) d;


    /**
     *----Compute the integer exponent dShift required to convert dBits from a 16.16
     *----fixed point value to a 0.32 fixed point value that is normalized to the
     *----mathematical range [0.5, 1) (i.e., determine dShift such that 0.5 <=
     *----((dBits * 2^dShift) / 2^32) < 1).
     */
    dShift = CountLeadingZeroes(dBits);


    /**
     *----Compute the integer exponent nShift required to convert nBits from a 16.16
     *----fixed point value to a 0.32 fixed point value that is normalized to the
     *----mathematical range [0.25, 0.5) (i.e., determine nShift such that 0.25 <=
     *----((nBits * 2^nShift) / 2^32) < 0.5). Note that nShift is -1 when the MSB of
     *----nBits is set (i.e., when n is 0x80000000).
     */
    nShift = CountLeadingZeroes(nBits) - 1;


    /**
     *----Normalize dBits to a 0.32 fixed point value that lies in the mathematical
     *----range [0.5, 1)
     */
    dBits <<= dShift;


    /**
     *----Normalize nBits to a 0.32 fixed point value that lies in the mathematical
     *----range [0.25, 0.5). A negative nShift (i.e., -1) requires a shift to the
     *----right by one bit; shifting by a negative amount is undefined in C.
     */
    if (nShift < 0) {
        nBits >>= 1;
    } else {
        nBits <<= nShift;
    }


    /**
     *----Determine the shift amount required to unnormalize the 1.31 fixed point
     *----value q and convert the result to a 16.16 fixed point value. Unnormalizing
     *----q requires shifting q to the right by (nShift - dShift) bits. To understand
     *----this formula, recall that nBits = (n * 2^nShift) and that dBits = (d *
     *----2^dShift) (see above). Therefore q = (nBits / dBits) = (n * 2^nShift) / (d
     *----* 2^dShift) = (n / d) * (2^nShift / 2^dShift) = (n / d) * 2^(nShift -
     *----dShift). It follows that (n / d) = (nBits / dBits) * 2^(dShift - nShift) =
     *----q * 2^(dShift - nShift). Therefore, unnormalizing q requires multiplying q
     *----by 2^(dShift - nShift), which is equivalent to shifting q to the right by
     *----(nShift - dShift) bits. Converting q from a 1.31 fixed point value to a
     *----16.16 fixed point value requires shifting q to the right by 15 bits.
     *----Therefore, q must be shifted to the right by a total of (15 + nShift -
     *----dShift) bits.
     */
    s = 15 + nShift - dShift;


    /**
     *----Determine if s is less than -31 or greater than 31
     */
    if (s < -31) {


        /**
         *----The required shift amount is less than -31 (i.e., the quotient must be
         *----shifted left by at least 32 bits, thereby overflowing the ADF_I1616
         *----fixed point representation). Set status to ADF_FIXED_MATH_OVERFLOW and
         *----return zero.
         */
        *status = ADF_FIXED_MATH_OVERFLOW;
        return(0);
        

    } else if (s > 31) {


        /**
         *----The required shift amount is greater than 31 (i.e., the quotient must
         *----be shifted right by at least 32 bits, thereby underflowing the
         *----ADF_I1616 fixed point representation). Set status to
         *----ADF_FIXED_MATH_UNDERFLOW and return zero.
         */
        *status = ADF_FIXED_MATH_UNDERFLOW;
        return(0);
    }


    /**
     *----Compute the unsigned quotient of nBits and dBits (i.e., nBits/dBits). Note
     *----that the DIV() function computes the result as a 1.31 fixed point value
     *----whose leading 1 bit is at bit position 30.
     */
    q = DIV(nBits, dBits);


    /**
     *----Determine if s is negative or positive
     */
    if (s < 0) {


        /**
         *----The required shift amount is negative and lies in the range [-31, -1].
         *----Therefore, unnormalizing q requires shifting q to the left by -s bits.
         *----Determine if the unnormalized quotient (i.e., q << (-s)) overflows the
         *----ADF_I1616 fixed point representation.
         */
        if ((q >> (32 + s)) != 0) {


            /**
             *----At least one of the abs(s) most significant bits of q is a 1 bit.
             *----This 1 bit is lost when shifting q to the left by -s bits.
             *----Therefore, the unnormalized quotient (i.e., q << (-s)) overflows
             *----the ADF_I1616 fixed point representation. Set status to
             *----ADF_FIXED_MATH_OVERFLOW and return zero.
             */
            *status = ADF_FIXED_MATH_OVERFLOW;
            return(0);
        }


        /**
         *----No 1 bit is lost by the shift. Shift q to the left by -s bits.
         */
        /*lint -e504  Warning 504: Unusual shift operation (unusually formed right argument) */
        q <<= -s;
        /*lint +e504  Warning 504: Unusual shift operation (unusually formed right argument) */


        /**
         *----The shifted magnitude must also fit into the signed ADF_I1616
         *----representation. The largest representable magnitude is 0x7fffffff for
         *----a non-negative quotient and 0x80000000 for a negative quotient. A
         *----larger magnitude would be returned with the wrong sign, so set status
         *----to ADF_FIXED_MATH_OVERFLOW and return zero.
         */
        qMax = qSign ? 0x80000000U : 0x7fffffffU;
        if (q > qMax) {
            *status = ADF_FIXED_MATH_OVERFLOW;
            return(0);
        }


    } else {


        /**
         *----The required shift amount is non-negative and lies in the range [0,
         *----31]. Shift q to the right by s bits.
         */
        q >>= s;


        /**
         *----If the normalized quotient (i.e., q) is zero, then the quotient
         *----underflows the ADF_I1616 fixed point representation. Set status to
         *----ADF_FIXED_MATH_UNDERFLOW and return zero.
         */
        if (q == 0) {
            *status = ADF_FIXED_MATH_UNDERFLOW;
            return(0);
        }
    }


    /**
     *----The unnormalized quotient neither overflows nor underflows the ADF_I1616
     *----fixed point representation. If the quotient is non-negative, return the
     *----quotient. If the quotient is negative, negate the unsigned quotient and
     *----return the result. The negation is performed on the unsigned value so that
     *----the magnitude 0x80000000 is negated without signed overflow. This step has
     *----the effect of rounding the quotient towards zero.
     */
    return(qSign ? (ADF_I1616) (0U - q) : (ADF_I1616) q);
}

#ifdef FS_EDGE_RENDER

/**
 *-----------------------------------------------------------------------------------
 *    Compute and return the sine of x, where x is expressed in radians. Both x and the
 *    computed sine of x are ADF_I1616 fixed point values.
 *
 *    Implementation notes:
 *
 *      - I1616_SIN() computes the sine of x using the following approach:
 *
 *          1. Normalize the input x to the range [0, PI / 2) by using repeated
 *             subtraction and the following trigonometric identities:
 *
 *               sin(x) = -sin(-x)
 *               sin(x) =     sin(x + (n * 2 * PI)) for all integers n
 *               sin(x) = -sin(2 * PI - x)
 *               sin(x) =     sin(PI - x)
 *
 *          2. Use the normalized value of x to perform two lookups into a precomputed
 *             table (see k_sincosTable[] below) and linearly interpolate between the
 *             resulting values.
 *-----------------------------------------------------------------------------------
 *-----------------------------------------------------------------------------------
 *    k_sincosTable is a lookup table for computing the sine and cosine of a value
 *    normalized to the range [0, PI / 2). The table contains 257 elements and is
 *    precomputed as follows.
 *
 *    k_sincosTable[i] = FLOAT_TO_I1616(sin(i * PI/512.0)) for integers i in the range
 *    [0, 256]. If f is a floating point value that lies in the range [0, PI / 2) and
 *    represents an angle in radians, then k_sincosTable[f * 512.0 / PI] contains the
 *    sine of f represented as an ADF_I1616 fixed point value, and k_sincosTable[256 -
 *    (f * 512.0 / PI)] contains the cosine of f represented as an ADF_I1616 fixed
 *    point value. The ability to store precomputed sine and cosine values in the same
 *    table is due to the trigonometric identity sin(x) = cos(PI / 2 - x).
 *-----------------------------------------------------------------------------------
 */
/**
 *----Quarter-wave sine table in ADF_I1616 format. Layout: 32 rows of 8 values
 *----(indices 0 through 255) followed by one final entry at index 256, for a total
 *----of 257 entries. I1616_SIN() reads entries [indexInt] and [indexInt + 1];
 *----I1616_COS() reads entries [indexInt] and [indexInt - 1]. The extra entry at
 *----index 256 keeps both of those neighbor lookups inside the array.
 */
static FS_CONST ADF_I1616 k_sincosTable[] = {
    /* index 0: sin(0) */
    0x0000, 0x0192, 0x0324, 0x04b6, 0x0648, 0x07da, 0x096c, 0x0afe, 
    0x0c8f, 0x0e21, 0x0fb2, 0x1144, 0x12d5, 0x1466, 0x15f6, 0x1787, 
    0x1917, 0x1aa7, 0x1c37, 0x1dc7, 0x1f56, 0x20e5, 0x2273, 0x2402, 
    0x2590, 0x271d, 0x28aa, 0x2a37, 0x2bc4, 0x2d50, 0x2edb, 0x3066, 
    0x31f1, 0x337b, 0x3505, 0x368e, 0x3817, 0x399f, 0x3b26, 0x3cad, 
    0x3e33, 0x3fb9, 0x413e, 0x42c3, 0x4447, 0x45ca, 0x474d, 0x48ce, 
    0x4a50, 0x4bd0, 0x4d50, 0x4ecf, 0x504d, 0x51ca, 0x5347, 0x54c3, 
    0x563e, 0x57b8, 0x5931, 0x5aaa, 0x5c22, 0x5d98, 0x5f0e, 0x6083, 
    0x61f7, 0x636a, 0x64dc, 0x664d, 0x67bd, 0x692d, 0x6a9b, 0x6c08, 
    0x6d74, 0x6edf, 0x7049, 0x71b1, 0x7319, 0x7480, 0x75e5, 0x774a, 
    0x78ad, 0x7a0f, 0x7b70, 0x7cd0, 0x7e2e, 0x7f8b, 0x80e7, 0x8242, 
    0x839c, 0x84f4, 0x864b, 0x87a1, 0x88f5, 0x8a48, 0x8b9a, 0x8cea, 
    0x8e39, 0x8f87, 0x90d3, 0x921e, 0x9368, 0x94b0, 0x95f6, 0x973c, 
    0x987f, 0x99c2, 0x9b02, 0x9c42, 0x9d7f, 0x9ebc, 0x9ff6, 0xa12f, 
    0xa267, 0xa39d, 0xa4d2, 0xa605, 0xa736, 0xa866, 0xa994, 0xaac0, 
    0xabeb, 0xad14, 0xae3b, 0xaf61, 0xb085, 0xb1a8, 0xb2c8, 0xb3e7, 
    /* index 128: sin(PI / 4) */
    0xb504, 0xb620, 0xb73a, 0xb852, 0xb968, 0xba7c, 0xbb8f, 0xbca0, 
    0xbdae, 0xbebc, 0xbfc7, 0xc0d0, 0xc1d8, 0xc2de, 0xc3e2, 0xc4e3, 
    0xc5e4, 0xc6e2, 0xc7de, 0xc8d8, 0xc9d1, 0xcac7, 0xcbbb, 0xccae, 
    0xcd9f, 0xce8d, 0xcf7a, 0xd064, 0xd14d, 0xd233, 0xd318, 0xd3fa, 
    0xd4db, 0xd5b9, 0xd695, 0xd770, 0xd848, 0xd91e, 0xd9f2, 0xdac4, 
    0xdb94, 0xdc61, 0xdd2d, 0xddf6, 0xdebe, 0xdf83, 0xe046, 0xe106, 
    0xe1c5, 0xe282, 0xe33c, 0xe3f4, 0xe4aa, 0xe55e, 0xe60f, 0xe6be, 
    0xe76b, 0xe816, 0xe8bf, 0xe965, 0xea09, 0xeaab, 0xeb4b, 0xebe8, 
    0xec83, 0xed1c, 0xedb2, 0xee46, 0xeed8, 0xef68, 0xeff5, 0xf080, 
    0xf109, 0xf18f, 0xf213, 0xf294, 0xf314, 0xf391, 0xf40b, 0xf484, 
    0xf4fa, 0xf56d, 0xf5de, 0xf64d, 0xf6ba, 0xf724, 0xf78b, 0xf7f1, 
    0xf853, 0xf8b4, 0xf912, 0xf96e, 0xf9c7, 0xfa1e, 0xfa73, 0xfac5, 
    0xfb14, 0xfb61, 0xfbac, 0xfbf5, 0xfc3b, 0xfc7e, 0xfcbf, 0xfcfe, 
    0xfd3a, 0xfd74, 0xfdab, 0xfde0, 0xfe13, 0xfe43, 0xfe70, 0xfe9b, 
    0xfec4, 0xfeea, 0xff0e, 0xff2f, 0xff4e, 0xff6a, 0xff84, 0xff9c, 
    0xffb1, 0xffc3, 0xffd3, 0xffe1, 0xffec, 0xfff4, 0xfffb, 0xfffe, 
    /* index 256: sin(PI / 2), i.e., 1.0 in 16.16 fixed point */
    0x00010000, 
};
/**
 *-----------------------------------------------------------------------------------
 *    I1616_SIN(): compute and return the sine of x, where x is an angle in radians.
 *    Both x and the result are ADF_I1616 fixed point values.
 *
 *    The angle is folded into the first quadrant, scaled to a table position, and
 *    the result is linearly interpolated between two adjacent entries of
 *    k_sincosTable[].
 *
 *    Edge cases:
 *
 *      - The most negative representable input has no positive counterpart, so it
 *        is treated as the next larger value before being negated.
 *
 *      - Reduction by multiples of (2 * PI) is done with a single remainder
 *        operation, so the cost does not depend on the magnitude of x.
 *
 *      - If the normalized angle evaluates to exactly 1.0 (i.e., PI / 2), the last
 *        table entry is returned directly.
 *-----------------------------------------------------------------------------------
 */
ADF_I1616 I1616_SIN (ADF_I1616 x)
{
    ADF_I32      indexInt;
    ADF_I32      indexFrac;
    ADF_I1616 sample1;
    ADF_I1616 sample2;
    ADF_I1616 result;
    ADF_I1616 normX;


    /**
     *----Set the isNegative Boolean to false (i.e., assume that the sine of x is
     *----positive)
     */
    ADF_I32 isNegative = 0;


    /**
     *----If x is negative, negate x and set the isNegative Boolean to true (i.e.,
     *----use the identity sin(x) = -sin(-x)). Negating the most negative 32-bit
     *----value would overflow, so that single input is first moved up by one unit
     *----(1 / 65536 of a radian).
     */
    if (x < 0) {
        if (x < -0x7fffffff) x = -0x7fffffff;
        x = -x;
        isNegative = 1;
    }


    /**
     *----x is now in the range [0, +inf). Reduce x to the range [0, 2 * PI) (i.e.,
     *----use the identity sin(x) = sin(x + (n * 2 * PI)) for all integers n). For a
     *----non-negative x the remainder is identical to the value obtained by
     *----subtracting (2 * PI) repeatedly, but it is computed in a single step.
     */
    x %= I1616_CONST_TWO_PI;


    /**
     *----x is now in the range [0, 2 * PI). If x lies in the range [PI, 2 * PI),
     *----subtract x from (2 * PI) and flip the isNegative Boolean (i.e., use the
     *----identity sin(x) = -sin(2 * PI - x)).
     */
    if (x >= I1616_CONST_PI) {
        x = I1616_CONST_TWO_PI - x;
        isNegative ^= 1;
    }


    /**
     *----x is now in the range [0, PI]. If x lies in the range [PI/2, PI], then
     *----subtract x from PI (i.e., use the identity sin(x) = sin(PI - x)).
     */
    if (x >= I1616_CONST_HALF_PI) x = I1616_CONST_PI - x;


    /**
     *----x is now in the first quadrant. Normalize x so that PI / 2 maps to 1.0.
     */
    normX = I1616_MUL(x, I1616_CONST_TWO_OVER_PI);


    if (normX >= 0x00010000) {
        /**
         *----Depending on how the fixed point constants and the multiplication
         *----round, the normalized angle may reach 1.0. Masking the table index to
         *----8 bits would then wrap it around to entry 0 (the sine of zero), so
         *----return the last table entry (the sine of PI / 2) instead.
         */
        result = k_sincosTable[256];
    } else {
        /**
         *----Determine the 8 integer bits of the index into the lookup table
         */
        indexInt = (normX >> 8) & 0xff;


        /**
         *----Determine the 8 fractional bits of the index into the lookup table,
         *----expressed as a 16.16 fixed point weight in the range [0, 1)
         */
        indexFrac = (normX << 8) & 0x0000ffff;


        /**
         *----Determine the sine of x by performing two table lookups and linearly
         *----interpolating between the results
         */
        sample1 = k_sincosTable[indexInt];
        sample2 = k_sincosTable[indexInt + 1];
        result = sample1 + I1616_MUL(indexFrac, sample2 - sample1);
    }


    /**
     *----Return the signed result
     */
    return(isNegative ? -result : result);
}


/**
 *-----------------------------------------------------------------------------------
 *    Compute and return the cosine of x, where x is expressed in radians. Both x and
 *    the computed cosine of x are ADF_I1616 fixed point values.
 *
 *    Implementation notes:
 *
 *      - I1616_COS() computes the cosine of x using the following approach:
 *
 *          1. Normalize the input x to the range [0, PI / 2) by using repeated
 *             subtraction and the following trigonometric identities:
 *
 *               cos(x) =     cos(-x)
 *               cos(x) =     cos(x + (n * 2 * PI)) for all integers n
 *               cos(x) =     cos(2 * PI - x)
 *               cos(x) = -cos(PI - x)
 *
 *          2. Use the normalized value of x to perform two lookups into a precomputed
 *             table (see k_sincosTable[] above) and linearly interpolate between the
 *             resulting values.
 *-----------------------------------------------------------------------------------
 */
ADF_I1616 I1616_COS (ADF_I1616 x)
{
    ADF_I32      indexInt;
    ADF_I32      indexFrac;
    ADF_I1616 sample1;
    ADF_I1616 sample2;
    ADF_I1616 result;
    ADF_I1616 normX;


    /**
     *----Set the isNegative Boolean to false (i.e., assume that the cosine of x is
     *----positive)
     */
    ADF_I32 isNegative = 0;


    /**
     *----If x is negative, negate x (i.e., use the identity cos(x) = cos(-x)).
     *----Negating the most negative 32-bit value would overflow, so that single
     *----input is first moved up by one unit (1 / 65536 of a radian).
     */
    if (x < 0) {
        if (x < -0x7fffffff) x = -0x7fffffff;
        x = -x;
    }


    /**
     *----x is now in the range [0, +inf). Reduce x to the range [0, 2 * PI) (i.e.,
     *----use the identity cos(x) = cos(x + (n * 2 * PI)) for all integers n). For a
     *----non-negative x the remainder is identical to the value obtained by
     *----subtracting (2 * PI) repeatedly, but it is computed in a single step.
     */
    x %= I1616_CONST_TWO_PI;


    /**
     *----x is now in the range [0, 2 * PI). If x lies in the range [PI, 2 * PI),
     *----subtract x from (2 * PI) (i.e., use the identity cos(x) = cos(2 * PI - x)).
     */
    if (x >= I1616_CONST_PI) x = I1616_CONST_TWO_PI - x;


    /**
     *----x is now in the range [0, PI]. If x lies in the range [PI/2, PI], then
     *----subtract x from PI and set the isNegative Boolean to true (i.e., use the
     *----identity cos(x) = -cos(PI - x)).
     */
    if (x >= I1616_CONST_HALF_PI) {
        x = I1616_CONST_PI - x;
        isNegative = 1;
    }


    /**
     *----x is now in the first quadrant. Normalize x so that PI / 2 maps to 1.0.
     */
    normX = I1616_MUL(x, I1616_CONST_TWO_OVER_PI);


    if (normX >= 0x00010000) {
        /**
         *----Depending on how the fixed point constants and the multiplication
         *----round, the normalized angle may reach 1.0. Masking the integer bits
         *----to 8 bits would then wrap them around to zero and select entry 256
         *----(the cosine of zero), so return entry 0 (the cosine of PI / 2)
         *----instead.
         */
        result = k_sincosTable[0];
    } else {
        /**
         *----Determine the 8 integer bits of the index into the lookup table. The
         *----table stores sines, so the cosine is read by indexing from the far
         *----end (i.e., cos(x) = sin(PI / 2 - x)).
         */
        indexInt = 256 - ((normX >> 8) & 0xff);


        /**
         *----Determine the 8 fractional bits of the index into the lookup table,
         *----expressed as a 16.16 fixed point weight in the range [0, 1)
         */
        indexFrac = (normX << 8) & 0x0000ffff;


        /**
         *----Determine the cosine of x by performing two table lookups and
         *----linearly interpolating between the results. Because the table is
         *----traversed backwards, the second sample is the preceding entry.
         */
        sample1 = k_sincosTable[indexInt];
        sample2 = k_sincosTable[indexInt - 1];
        result = sample1 + I1616_MUL(indexFrac, sample2 - sample1);
    }


    /**
     *----Return the signed result
     */
    return(isNegative ? -result : result);
}


/**
 *-----------------------------------------------------------------------------------
 *    Compute and return the base 2 logarithm of f, where the input f is an ADF_I1616
 *    fixed point value that lies in the mathematical range (0, 1) (note the exclusion
 *    of the values 0 and 1). The computed result is an ADF_I1616 fixed point value.
 *
 *    Implementation notes:
 *
 *      - Log2Input01() computes the base 2 logarithm of f using the following steps:
 *
 *          1. Let f = m * 2^e, where e is an integer exponent and m is a mantissa
 *             normalized to the mathematical range [1, 2) (i.e., the integer portion
 *             of m is 1). Note that log2(f) = log2(m * 2^e) = log2(m) + log2(2^e) =
 *             log2(m) + e. This derivation makes use of the identity log2(a * b) =
 *             log2(a) + log2(b) for all values a and b.
 *
 *          2. f is a 16.16 fixed point value, so the mantissa m must have its leading
 *             1 bit at bit position 16. Since f = m * 2^e = m << e, the integer
 *             exponent e can be computed by counting the number of leading zeroes of f
 *             and subtracting the result from 15. For example, if the number of
 *             leading zeroes of f is 17, then the leading 1 bit of f is in bit
 *             position 14. Therefore, the exponent e is 15 - 17 = -2. Note that since
 *             f lies in the mathematical range (0, 1), the exponent e is always
 *             negative.
 *
 *          3. Compute the mantissa m by evaluating m = (f / 2^e) = (f * 2^(-e)). Since
 *             the exponent e is negative, this step is implemented by shifting f to
 *             the left by -e bits.
 *
 *          4. Compute log2(m) using a second order Taylor expansion. The Taylor
 *             expansion of log2(x) centered at x = 1 is (1 / ln(2)) * (-1 + x -
 *             ((x-1)^2 / 2) + ((x-1)^3 / 3) - ((x-1)^4 / 4) + ...), where ln(2)
 *             denotes the natural logarithm of 2. Instead of using this formula
 *             directly, however, the implementation below uses the modified formula
 *             log2(m) = (-1 - 2*k + (x * (1 + (3 * k))) - (x^2 * k)), where k =
 *             0.3466033935546875. The reason for using this modified formula is that
 *             the Taylor series expansion described above is centered at x = 1, but m
 *             lies in the mathematical range [1, 2). Therefore, using the second order
 *             Taylor series expansion directly provides accurate results when m is
 *             close to 1 but less accurate results when m is close to 2. The modified
 *             formula compensates for the error when m approaches 2 and hence provides
 *             better accuracy throughout the entire range [1, 2).
 *
 *          5. Compute log2(f) by adding e to log2(m). Recall from step 1 that log2(f)
 *             = log2(m * 2^e) = log2(m) + log2(2^e) = log2(m) + e.
 *-----------------------------------------------------------------------------------
 */
static ADF_I1616 Log2Input01 (ADF_I1616 f)
{
    /**
     *----Let f = m * 2^e, where e is an integer exponent and m is a mantissa
     *----normalized to the mathematical range [1, 2). Compute the integer exponent e
     *----by subtracting the number of leading zeroes of f from 15. For an input in
     *----the required range (0, 1), only the low 16 bits of f can be set, so e is
     *----always negative.
     */
    ADF_I32 e = 15 - CountLeadingZeroes(f);


    /**
     *----Determine the mantissa m by normalizing f so that the leading 1 bit is at
     *----bit position 16. Since e is negative for inputs in the required range,
     *----(-e) is a positive shift count.
     */
    /*lint -e504  Warning 504: Unusual shift operation (unusually formed right argument) */
    ADF_I1616 m = f << (-e);
    /*lint +e504  Warning 504: Unusual shift operation (unusually formed right argument) */


    /**
     *----Compute the base 2 logarithm of m as (m - 1) + k * (frac - frac^2), where
     *----frac is the fractional part of m and k is the 16.16 fixed point constant
     *----0x58bb
     */
    ADF_I1616 frac = m & 0xffff;
    ADF_I1616 log2m = m + I1616_MUL(frac - I1616_MUL(frac, frac), 0x58bb) - 0x10000;


    /**
     *----Since f = m * 2^e, log2(f) = log2(m * 2^e) = log2(m) + log2(2^e) = log2(m)
     *----+ e. Compute log2(f) by converting e from an integer to a 16.16 fixed point
     *----value and adding the result to log2(m). The conversion multiplies by
     *----0x10000 rather than shifting left by 16 bits because e is negative and a
     *----left shift of a negative value is undefined in C.
     */
    return(log2m + (e * 0x10000));
}


/**
 *-----------------------------------------------------------------------------------
 *    Compute and return 2^f, where the input f is a negative ADF_I1616 fixed point
 *    value. The computed result is an ADF_I1616 fixed point value.
 *
 *    Implementation notes:
 *
 *      - Exp2InputNeg() computes 2^f using the following steps:
 *
 *          1. Let the input f = log2(y) for some value y, where log2(y) denotes the
 *             base 2 logarithm of y. Then 2^f = 2^(log2(y)) = y.
 *
 *          2. Let y = m * 2^e, where e is an integer exponent and m is a mantissa
 *             normalized to the mathematical range [1, 2) (i.e., the integer portion
 *             of m is 1).
 *
 *          3. Note that f = log2(y) = log2(m * 2^e) = log2(m) + log2(2^e) = log2(m) +
 *             e. This derivation makes use of the identity log2(a * b) = log2(a) +
 *             log2(b) for all values a and b. Since the mantissa m lies in the
 *             mathematical range [1, 2), then log2(m) must lie in the mathematical
 *             range [0, 1). It follows that the integer part of f (i.e., floor(f)) is
 *             equal to e and the fractional part of f (i.e., f - floor(f)) is equal to
 *             log2(m).
 *
 *          4. Compute the mantissa m by evaluating 2^log2(m). This exponentiation can
 *             be evaluated using a second order Taylor series expansion of 2^x. The
 *             Taylor series expansion of 2^x centered at x = 0 is 1 + (x * ln(2)) +
 *             ((x^2 * ln(2)^2) / 2!) + ((x^3 * ln(2)^3) / 3!) + ..., where ln(2)
 *             denotes the natural logarithm of 2. Instead of using this formula
 *             directly, however, the implementation below uses the modified formula m
 *             = (1 + (x * (1 - k)) + (k * x^2)), where k is 0.3397064208984375. The
 *             reason for using this modified formula is that the Taylor series
 *             expansion described above is centered at x = 0, but log2(m) lies in the
 *             mathematical range [0, 1). Therefore, using the second order Taylor
 *             series expansion directly provides accurate results when log2(m) is
 *             close to zero but less accurate results when log2(m) is close to 1. The
 *             modified formula compensates for the error when log2(m) approaches 1 and
 *             hence provides better accuracy throughout the entire range [0, 1).
 *
 *          5. 2^f can be computed by multiplying m by 2^e. Recall from step 1 that 2^f
 *             = 2^(log2(y)) = y = m * 2^e.
 *-----------------------------------------------------------------------------------
 */
static ADF_I1616 Exp2InputNeg (ADF_I1616 f)
{
    ADF_I1616 log2m;
    ADF_I1616 m;
    ADF_I32   shift;


    /**
     *----2^0 is exactly 1.0. Positive inputs are outside the domain of this
     *----function (f is expected to be negative); they are treated like zero so
     *----that the shift count computed below is always well defined.
     */
    if (f >= 0) return(0x00010000);


    /**
     *----Set log2m (i.e., the base 2 logarithm of the mantissa m) to the fractional
     *----bits of the input f
     */
    log2m = (f & 0xffff);


    /**
     *----Compute the 16.16 fixed point mantissa m as 2^log2(m) using a modified
     *----second order Taylor series expansion of 2^x: 2^x ~= (1 + x - (k * (x -
     *----x^2))), where k is approximately 0.3397064208984375. Note that k is 0x56f7
     *----when expressed as a 16.16 fixed point value.
     */
    m = 0x00010000 + log2m - I1616_MUL(
    log2m - I1616_MUL(log2m, log2m), 0x56f7);


    /**
     *----The integer exponent e is the integer portion of f (i.e., floor(f)), which
     *----is negative, and 2^f = m * 2^e is computed by shifting m to the right by
     *-----e bits. Compute the shift count -e directly: for a negative f in two's
     *----complement representation, ~f equals (-f - 1), which is non-negative, so
     *----((~f) >> 16) + 1 equals -floor(f / 65536). This avoids shifting a
     *----negative value to the right, whose result is not guaranteed by ANSI C.
     */
    shift = ((~f) >> 16) + 1;


    /**
     *----A shift count of 32 or more is undefined for a 32-bit operand (some
     *----processors use only the low bits of the count and would return a non-zero
     *----value). The mathematical result is too small to represent in 16.16 fixed
     *----point, so return zero.
     */
    if (shift >= 32) return(0);


    /**
     *----Since f = log2(y) = log2(m * 2^e) = log2(m) + e, then 2^f = 2^(log2(y)) = y
     *----= m * 2^e. Compute 2^f by shifting m to the right by -e bits.
     */
    return(m >> shift);
}


/**
 *-----------------------------------------------------------------------------------
 *    Compute and return x^y, where the input x must lie in the range (0,1) (note the
 *    exclusion of 0 and 1) and the input y must be positive. The inputs x and y and
 *    the computed result are ADF_I1616 fixed point values.
 *-----------------------------------------------------------------------------------
 */
ADF_I1616 I1616_POW01 (ADF_I1616 x, ADF_I1616 y)
{
    /**
     *----Apply the identities x^y = 2^(log2(x^y)) = 2^(y * log2(x)) in three steps:
     *----take the base 2 logarithm of x, scale it by y, and raise 2 to the
     *----resulting exponent
     */
    ADF_I1616 log2x = Log2Input01(x);
    ADF_I1616 exponent = I1616_MUL(y, log2x);


    return(Exp2InputNeg(exponent));
}

#endif /* FS_EDGE_RENDER */
/**
 *-----------------------------------------------------------------------------------
 *    END: FIXED POINT MATH ONLY
 *-----------------------------------------------------------------------------------
 */

/**
 *-----------------------------------------------------------------------------------
 *    END: iType ADF Rendering
 *-----------------------------------------------------------------------------------
 */
#endif /* FS_EDGE_HINTS or FS_EDGE_RENDER */
